From fe4449b2e4914d4a56702ba2ba1dc39d91974375 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 30 Nov 2024 17:39:17 +0000 Subject: [PATCH 001/751] Improve: `#pragma region` dashes --- include/stringzilla/stringzilla.h | 6 +++--- scripts/bench_sort.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index 3721c5b0..90a4b7e9 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -1188,7 +1188,7 @@ SZ_PUBLIC void sz_sort_intro(sz_sequence_t *sequence, sz_sequence_comparator_t l #endif #endif // SZ_USE_ARM_SVE -#pragma region Hardware - Specific API +#pragma region Hardware Specific API #if SZ_USE_X86_AVX512 @@ -4458,7 +4458,7 @@ SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t start, sz_size_t length, sz_size_t windo * * 2019 IceLake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES * * 2020 TigerLake: VP2INTERSECT */ -#pragma region AVX - 512 Implementation +#pragma region AVX512 Implementation #if SZ_USE_X86_AVX512 #pragma GCC push_options @@ -6274,7 +6274,7 @@ SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length) /* * @brief Pick the right implementation for the string search algorithms. */ -#pragma region Compile - Time Dispatching +#pragma region Compile Time Dispatching SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t ins, sz_size_t length) { return sz_hash_serial(ins, length); } SZ_PUBLIC void sz_tolower(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_tolower_serial(ins, length, outs); } diff --git a/scripts/bench_sort.cpp b/scripts/bench_sort.cpp index b70409ca..f46be4a3 100644 --- a/scripts/bench_sort.cpp +++ b/scripts/bench_sort.cpp @@ -21,7 +21,7 @@ using strings_t = std::vector; using idx_t = sz_size_t; using permute_t = std::vector; -#pragma region - C callbacks +#pragma region C callbacks static char const *get_start(sz_sequence_t const *array_c, sz_size_t i) { strings_t const &array = *reinterpret_cast(array_c->handle); From 585f7d5dd8940a045fce616c23fbe147e1a1b3f5 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 30 Nov 2024 17:41:03 +0000 Subject: [PATCH 002/751] Fix: `sz_look_up_transform_avx512` declaration --- include/stringzilla/stringzilla.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index 90a4b7e9..e1c1d910 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -1202,8 +1202,8 @@ SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length); /** @copydoc sz_fill */ SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_tranform */ -SZ_PUBLIC void sz_look_up_tranform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); +/** @copydoc sz_look_up_transform */ +SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); /** @copydoc sz_find_byte */ SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); /** @copydoc sz_rfind_byte */ From 715ad100d6e667f5c34ad60752ef6f34f90c993d Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 2 Dec 2024 22:03:45 +0000 Subject: [PATCH 003/751] 
Docs: Levenshtein tutorial in Jupyter --- scripts/test_levenshtein.ipynb | 342 +++++++++++++++++++++++++++------ 1 file changed, 283 insertions(+), 59 deletions(-) diff --git a/scripts/test_levenshtein.ipynb b/scripts/test_levenshtein.ipynb index fc8f9bf6..4718c386 100644 --- a/scripts/test_levenshtein.ipynb +++ b/scripts/test_levenshtein.ipynb @@ -1,29 +1,52 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 25, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exploring the Impact of Evaluation Order on Edit Distance Algorithms\n", + "\n", + "Removing data-dependencies in the Wagner-Fisher, Needleman-Wunsch, Smith-Waterman, and Gotoh Dynamic Programming algorithms to explain the hardware-accelerated variants in StringZilla." + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import numpy as np\n", - "import random" + "## Levenshtein Distance\n", + "\n", + "Levenshtein edit distance is one of the most broadly studied string similarity metrics.\n", + "It is defined as the minimum number of single-character insertions, deletions, and substitutions required to change one string into another.\n", + "The Levenshtein distance between two strings is calculated using dynamic programming algorithms, such as the Wagner-Fisher algorithm, and its variations for Bioinformatics: \n", + "\n", + "- Needleman-Wunsch for global alignment with substitution matrices, \n", + "- Smith-Waterman for local alignment with substitution matrices, \n", + "- Gotoh for different penalties for gap opening and extensions.\n", + "\n", + "Given the shared nature of these algorithms, the same tricks can be applied to all of them to improve their performance." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Exploring the Impact of Evaluation Order on the Wagner Fisher Algorithm for Levenshtein Edit Distance" + "## Warner-Fisher Algorithm\n", + "\n", + "Wagner-Fisher algorithm, in its most naive form, has a time and space complexity of $O(NM)$, where $N$ and $M$ are the lengths of the two strings being compared.\n", + "A rectangular matrix of size $(N+1) \\times (M+1)$ is created to store the edit distances between all prefixes of the two strings.\n", + "The first row and column are, naturally, initialized with ${0, 1, 2, ..., N}$ and ${0, 1, 2, ..., M}$ respectively." ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "def algo_v0(s1, s2) -> int:\n", + "from typing import Tuple\n", + "import numpy as np # NumPy for matrices\n", + "\n", + "def wagner_fisher(s1: str, s2: str) -> Tuple[int, np.ndarray]:\n", " # Create a matrix of size (len(s1)+1) x (len(s2)+1)\n", " matrix = np.zeros((len(s1) + 1, len(s2) + 1), dtype=int)\n", "\n", @@ -38,12 +61,12 @@ " for j in range(1, len(s2) + 1):\n", " substitution_cost = s1[i - 1] != s2[j - 1]\n", " matrix[i, j] = min(\n", - " matrix[i - 1, j] + 1, # Deletion\n", - " matrix[i, j - 1] + 1, # Insertion\n", - " matrix[i - 1, j - 1] + substitution_cost, # Substitution\n", + " matrix[i - 1, j] + 1, #? Deletion cost\n", + " matrix[i, j - 1] + 1, #? Insertion cost\n", + " matrix[i - 1, j - 1] + substitution_cost, #? 
Substitution cost\n", " )\n", "\n", - " # Return the Levenshtein distance\n", + " # The distance will be placed in the bottom right corner of the matrix\n", " return matrix[len(s1), len(s2)], matrix" ] }, @@ -51,25 +74,32 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Accelerating this exact algorithm isn't trivial, is the `matrix[i, j]` value has a dependency on the `matrix[i, j-1]` value.\n", - "So we can't brute-force accelerate the inner loop.\n", - "Instead, we can show that we can evaluate the matrix in a different order, and still get the same result." + "This algorithm is almost never recommended for practical use, as it has a quadratic space complexity.\n", + "It's trivial to see that the space complexity can be reduced to $O(min(N, M))$ by only storing the last two rows of the matrix, but we want to keep the entire matrix as a reference to allow debugging and visualization." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "![](https://mathworld.wolfram.com/images/eps-svg/SkewDiagonal_1000.svg)" + "## Diagonal Evaluation Order\n", + "\n", + "Accelerating this exact algorithm with SIMD instructions isn't trivial, is the `matrix[i, j]` value has a dependency on the `matrix[i, j - 1]` value.\n", + "So we can't brute-force accelerate the inner loop.\n", + "Instead, we can show that we can evaluate the matrix in a different order, and still get the same result.\n", + "\n", + "![Skewed Diagonals Evaluation Order](https://mathworld.wolfram.com/images/eps-svg/SkewDiagonal_1000.svg)\n", + "\n", + "But before complicating things too much, let's start with a simple case - when both strings have identical lengths and the DP matrix has a square shape." ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "def algo_v1(s1, s2, verbose: bool = False) -> int:\n", + "def square_skewed_diagonals(s1: str, s2: str, verbose: bool = False) -> Tuple[int, np.ndarray]:\n", " assert len(s1) == len(s2), \"First define an algo for square matrices!\"\n", " # Create a matrix of size (len(s1)+1) x (len(s2)+1)\n", " matrix = np.zeros((len(s1) + 1, len(s2) + 1), dtype=int)\n", @@ -83,35 +113,45 @@ "\n", " # Number of rows and columns in the square matrix.\n", " n = len(s1) + 1\n", + " \n", + " # Number of diagonals and skewed diagonals in the square matrix of size (n x n).\n", " skew_diagonals_count = 2 * n - 1\n", - " # Compute Levenshtein distance\n", - " for skew_diagonal_idx in range(2, skew_diagonals_count):\n", - " skew_diagonal_length = (skew_diagonal_idx + 1) if skew_diagonal_idx < n else (2*n - skew_diagonal_idx - 1)\n", + " \n", + " # Populate the matrix in 2 separate loops: for the top left triangle and for the bottom right triangle.\n", + " for skew_diagonal_idx in range(2, n):\n", + " skew_diagonal_length = skew_diagonal_idx + 1\n", + " for offset_within_skew_diagonal in range(1, skew_diagonal_length - 1):\n", + " # If we haven't passed the main skew diagonal yet, \n", + " # then we have to skip the first and the last operation,\n", + " # as those are already pre-populated and form the first column \n", + " # and the first row of the Levenshtein matrix respectively.\n", + " i = skew_diagonal_idx - offset_within_skew_diagonal\n", + " j = offset_within_skew_diagonal\n", + " if verbose:\n", + " print(f\"top left triangle: {skew_diagonal_idx=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", + " substitution_cost = s1[i - 1] != s2[j - 1]\n", + " matrix[i, j] = min(\n", + " matrix[i - 1, j] + 1, #? 
Deletion cost\n", + " matrix[i, j - 1] + 1, #? Insertion cost\n", + " matrix[i - 1, j - 1] + substitution_cost, #? Substitution cost\n", + " )\n", + " \n", + " # Now the bottom right triangle of the matrix.\n", + " for skew_diagonal_idx in range(n, skew_diagonals_count):\n", + " skew_diagonal_length = 2*n - skew_diagonal_idx - 1\n", " for offset_within_skew_diagonal in range(skew_diagonal_length):\n", - " if skew_diagonal_idx < n:\n", - " # If we passed the main skew diagonal yet, \n", - " # Then we have to skip the first and the last operation,\n", - " # as those are already pre-populated and form the first column \n", - " # and the first row of the Levenshtein matrix respectively.\n", - " if offset_within_skew_diagonal == 0 or offset_within_skew_diagonal + 1 == skew_diagonal_length:\n", - " continue \n", - " i = skew_diagonal_idx - offset_within_skew_diagonal\n", - " j = offset_within_skew_diagonal\n", - " if verbose:\n", - " print(f\"top left triangle: {skew_diagonal_idx=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", - " else:\n", - " i = n - offset_within_skew_diagonal - 1\n", - " j = skew_diagonal_idx - n + offset_within_skew_diagonal + 1\n", - " if verbose:\n", - " print(f\"bottom right triangle: {skew_diagonal_idx=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", + " i = n - offset_within_skew_diagonal - 1\n", + " j = skew_diagonal_idx - n + offset_within_skew_diagonal + 1\n", + " if verbose:\n", + " print(f\"bottom right triangle: {skew_diagonal_idx=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", " substitution_cost = s1[i - 1] != s2[j - 1]\n", " matrix[i, j] = min(\n", - " matrix[i - 1, j] + 1, # Deletion\n", - " matrix[i, j - 1] + 1, # Insertion\n", - " matrix[i - 1, j - 1] + substitution_cost, # Substitution\n", + " matrix[i - 1, j] + 1, #? Deletion cost\n", + " matrix[i, j - 1] + 1, #? Insertion cost\n", + " matrix[i - 1, j - 1] + substitution_cost, #? 
Substitution cost\n", " )\n", "\n", - " # Return the Levenshtein distance\n", + " # Similarly, the distance will be placed in the bottom right corner of the matrix\n", " return matrix[len(s1), len(s2)], matrix" ] }, @@ -124,16 +164,17 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ + "import random\n", "for _ in range(10):\n", " s1 = ''.join(random.choices(\"ab\", k=50))\n", " s2 = ''.join(random.choices(\"ab\", k=50))\n", - " d0, _ = algo_v0(s1, s2)\n", - " d1, _ = algo_v1(s1, s2)\n", - " assert d0 == d1 " + " d0, _ = wagner_fisher(s1, s2)\n", + " d1, _ = square_skewed_diagonals(s1, s2)\n", + " assert d0 == d1" ] }, { @@ -146,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -154,7 +195,7 @@ "text/plain": [ "('listen',\n", " 'silent',\n", - " 'distance = 4',\n", + " 'distance = np.int64(4)',\n", " array([[0, 1, 2, 3, 4, 5, 6],\n", " [1, 1, 2, 2, 3, 4, 5],\n", " [2, 2, 1, 2, 3, 4, 5],\n", @@ -164,7 +205,7 @@ " [6, 5, 5, 5, 4, 3, 4]]))" ] }, - "execution_count": 29, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -174,13 +215,13 @@ "s2 = \"silent\"\n", "# s1 = ''.join(random.choices(\"abcd\", k=100))\n", "# s2 = ''.join(random.choices(\"abcd\", k=100))\n", - "distance, baseline = algo_v0(s1, s2)\n", + "distance, baseline = wagner_fisher(s1, s2)\n", "s1, s2, f\"{distance = }\", baseline" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -191,7 +232,7 @@ " array([0, 0, 0, 0, 0, 0, 0], dtype=uint64))" ] }, - "execution_count": 30, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -233,7 +274,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -244,7 +285,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -258,7 +299,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -269,7 +310,7 @@ " array([6, 4, 3, 2, 3, 4, 6], dtype=uint64))" ] }, - "execution_count": 33, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -306,7 +347,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -317,7 +358,7 @@ " array([4, 5, 4, 5, 5, 5, 6], dtype=uint64))" ] }, - "execution_count": 34, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -342,12 +383,195 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "assert distance == following[0], f\"{distance = } != {following[0] = }\"" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generalizing to Non-Square Matrices" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def skewed_diagonals(s1, s2, verbose: bool = False) -> int:\n", + " shorter, longer = (s1, s2) if len(s1) < len(s2) else (s2, s1) \n", + " shorter_dim = len(shorter) + 1\n", + " longer_dim = len(longer) + 1\n", + " # Create a matrix of size (len(s1)+1) x (len(s2)+1)\n", + " matrix = np.zeros((len(shorter) + 1, len(longer) + 1), dtype=int)\n", + " matrix[:, :] = 99\n", + "\n", + " # Initialize the first column and first row of the matrix\n", + " for i in range(shorter_dim):\n", 
+ " matrix[i, 0] = i\n", + " for j in range(longer_dim):\n", + " matrix[0, j] = j\n", + "\n", + " # Let's say we are dealing with 6 and 9 letter words.\n", + " # The matrix will have size 7 x 10, parameterized as (shorter_dim x longer_dim).\n", + " # It will have:\n", + " # - 8 diagonals of increasing length, at positions: 0, 1, 2, 3, 4, 5, 6, 7.\n", + " # - 2 diagonals of fixed length, at positions: 8, 9.\n", + " # - 8 diagonals of decreasing length, at positions: 10, 11, 12, 13, 14, 15, 16, 17.\n", + " skew_diagonals_count = 2 * longer_dim - 1\n", + "\n", + " # Same as with square matrices, the 0th diagonal contains - just one element - zero - skipping it.\n", + " # Same as with square matrices, the 1st diagonal contains the values 1 and 1 - skipping it.\n", + " # Now let's handle the rest of the upper triangle.\n", + " for skew_diagonal_idx in range(2, shorter_dim + 1):\n", + " skew_diagonal_length = (skew_diagonal_idx + 1)\n", + " for offset_within_skew_diagonal in range(1, skew_diagonal_length-1): #! Skip the first column & row\n", + " # If we haven't passed the main skew diagonal yet, \n", + " # then we have to skip the first and the last operation,\n", + " # as those are already pre-populated and form the first column \n", + " # and the first row of the Levenshtein matrix respectively.\n", + " i = skew_diagonal_idx - offset_within_skew_diagonal\n", + " j = offset_within_skew_diagonal\n", + " if verbose:\n", + " print(f\"top left triangle: {skew_diagonal_idx=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", + " shorter_char = shorter[i - 1]\n", + " longer_char = longer[j - 1]\n", + " substitution_cost = shorter_char != longer_char\n", + " matrix[i, j] = min(\n", + " matrix[i - 1, j] + 1, # Deletion\n", + " matrix[i, j - 1] + 1, # Insertion\n", + " matrix[i - 1, j - 1] + substitution_cost, # Substitution\n", + " )\n", + " \n", + " # Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles. 
\n", + " for skew_diagonal_idx in range(shorter_dim + 1, longer_dim + 1):\n", + " skew_diagonal_length = shorter_dim\n", + " for offset_within_skew_diagonal in range(skew_diagonal_length):\n", + " i = shorter_dim - offset_within_skew_diagonal - 1\n", + " j = offset_within_skew_diagonal + 1\n", + " if verbose:\n", + " print(f\"anti-band: {skew_diagonal_idx=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", + " shorter_char = shorter[i - 1]\n", + " longer_char = longer[j - 1]\n", + " substitution_cost = shorter_char != longer_char\n", + " matrix[i, j] = min(\n", + " matrix[i - 1, j] + 1, # Deletion\n", + " matrix[i, j - 1] + 1, # Insertion\n", + " matrix[i - 1, j - 1] + substitution_cost, # Substitution\n", + " )\n", + " \n", + " # Now let's handle the bottom right triangle.\n", + " for skew_diagonal_idx in range(longer_dim + 1, skew_diagonals_count):\n", + " skew_diagonal_length = 2 * longer_dim - skew_diagonal_idx - 1\n", + " for offset_within_skew_diagonal in range(skew_diagonal_length):\n", + " i = shorter_dim - offset_within_skew_diagonal - 1\n", + " j = skew_diagonal_idx - longer_dim + offset_within_skew_diagonal + 1\n", + " if verbose:\n", + " print(f\"bottom right triangle: {skew_diagonal_idx=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", + " shorter_char = shorter[i - 1]\n", + " longer_char = longer[j - 1]\n", + " substitution_cost = shorter_char != longer_char\n", + " matrix[i, j] = min(\n", + " matrix[i - 1, j] + 1, # Deletion\n", + " matrix[i, j - 1] + 1, # Insertion\n", + " matrix[i - 1, j - 1] + substitution_cost, # Substitution\n", + " )\n", + "\n", + " # Return the Levenshtein distance\n", + " return matrix[len(shorter), len(longer)], matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('listeners',\n", + " 'silents',\n", + " 'distance = np.int64(5)',\n", + " array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],\n", + " [1, 1, 2, 2, 3, 4, 5, 6, 7, 8],\n", + " [2, 2, 1, 2, 3, 4, 5, 6, 7, 8],\n", + " [3, 2, 2, 2, 3, 4, 5, 6, 7, 8],\n", + " [4, 3, 3, 3, 3, 3, 4, 5, 6, 7],\n", + " [5, 4, 4, 4, 4, 4, 3, 4, 5, 6],\n", + " [6, 5, 5, 5, 4, 5, 4, 4, 5, 6],\n", + " [7, 6, 6, 5, 5, 5, 5, 5, 5, 5]]))" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s1 = \"listeners\"\n", + "s2 = \"silents\"\n", + "distance, baseline = skewed_diagonals(s1, s2)\n", + "s1, s2, f\"{distance = }\", baseline" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('listeners',\n", + " 'silents',\n", + " 'distance = np.int64(5)',\n", + " array([[0, 1, 2, 3, 4, 5, 6, 7],\n", + " [1, 1, 2, 2, 3, 4, 5, 6],\n", + " [2, 2, 1, 2, 3, 4, 5, 6],\n", + " [3, 2, 2, 2, 3, 4, 5, 5],\n", + " [4, 3, 3, 3, 3, 4, 4, 5],\n", + " [5, 4, 4, 4, 3, 4, 5, 5],\n", + " [6, 5, 5, 5, 4, 3, 4, 5],\n", + " [7, 6, 6, 6, 5, 4, 4, 5],\n", + " [8, 7, 7, 7, 6, 5, 5, 5],\n", + " [9, 8, 8, 8, 7, 6, 6, 5]]))" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "distance, baseline = wagner_fisher(s1, s2)\n", + "s1, s2, f\"{distance = }\", baseline" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "s1 = ''.join(random.choices(\"abcd\", k=5))\n", + "s2 = ''.join(random.choices(\"abcd\", k=6))\n", + "distance_v0, baseline_v0 = wagner_fisher(s1, s2)\n", + "distance_v2, baseline_v2 = skewed_diagonals(s1, s2, verbose=False)\n", + 
"assert distance_v0 == distance_v2, f\"{distance_v0 = } != {distance_v2 = }\"\n", + "assert np.all(baseline_v0 == baseline_v2), f\"{baseline_v0 = }\\n{baseline_v2 = }\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -366,7 +590,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.12.2" } }, "nbformat": 4, From d3b423a4c647bec1c823857a4bc043b77d6c2df3 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 09:09:19 +0000 Subject: [PATCH 004/751] Improve: Levenshtein functions for unicode --- scripts/test.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/scripts/test.cpp b/scripts/test.cpp index 47ef46d2..cb7d0079 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -1394,6 +1394,20 @@ static void test_levenshtein_distances() { {"abc", "adc", 1}, // one substitution {"abc", "abc", 0}, // same string {"ggbuzgjux{}l", "gbuzgjux{}l", 1}, // one insertion (prepended) + {"apple", "aple", 1}, + // Unicode: + {"αβγδ", "αγδ", 2}, // Each Greek symbol is 2 bytes in size + {"مرحبا بالعالم", "مرحبا يا عالم", 3}, // "Hello World" vs "Welcome to the World" ? + {"école", "école", 3}, // letter "é" as a single character vs "e" + "´" + {"Schön", "Scho\u0308n", 3}, // "ö" represented as "o" + "¨" + {"💖", "💗", 1}, // 4-byte emojis: Different hearts + {"𠜎 𠜱 𠝹 𠱓", "𠜎𠜱𠝹𠱓", 3}, // Ancient Chinese characters, no spaces vs spaces + {"München", "Muenchen", 2}, // German name with umlaut vs. its transcription + {"façade", "facade", 2}, // "ç" represented as "c" with cedilla vs. plain "c" + {"こんにちは世界", "こんばんは世界", 3}, // Japanese: "Good morning world" vs "Good evening world" + {"👩‍👩‍👧‍👦", "👨‍👩‍👧‍👦", 1}, // Family emojis with different compositions + {"Data科学123", "Data科學321", 3}, + {"🙂🌍🚀", "🙂🌎✨", 5}, }; using matrix_t = std::int8_t[256][256]; @@ -1435,6 +1449,7 @@ static void test_levenshtein_distances() { std::size_t iterations; } fuzzy_cases[] = { {10, 1000}, + {64, 128}, {100, 100}, {1000, 10}, }; From 1765f334230e60c7884a1c7efc48f2227c1ed2c9 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 09:10:38 +0000 Subject: [PATCH 005/751] Add: Missing Rust interfaces `sz_checksum`, `sz_hash`, `sz_edit_distance_utf8`, `sz_edit_distance_bounded`, `sz_edit_distance_utf8_bounded`. --- rust/lib.rs | 145 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 139 insertions(+), 6 deletions(-) diff --git a/rust/lib.rs b/rust/lib.rs index 30150efb..08c8772a 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -8,7 +8,7 @@ pub mod sz { - use core::ffi::c_void; + use core::{ffi::c_void, usize}; // Import the functions from the StringZilla C library. extern "C" { @@ -54,6 +54,10 @@ pub mod sz { needle_length: usize, ) -> *const c_void; + fn sz_hash(text: *const c_void, length: usize) -> u64; + + fn sz_checksum(text: *const c_void, length: usize) -> u64; + fn sz_edit_distance( haystack1: *const c_void, haystack1_length: usize, @@ -98,8 +102,6 @@ pub mod sz { allocator: *const c_void, ) -> isize; - // type RandomGeneratorT = fn(*mut c_void) -> u64; - fn sz_generate( alphabet: *const c_void, alphabet_size: usize, @@ -110,6 +112,51 @@ pub mod sz { ); } + /// Computes the checksum value of unsigned bytes in a given byte slice `text`. 
+ /// This function is useful for verifying data integrity and detecting changes in + /// binary data, such as files or network packets. + /// + /// # Arguments + /// + /// * `text`: The byte slice to compute the checksum for. + /// + /// # Returns + /// + /// A `u64` representing the checksum value of the input byte slice. + pub fn checksum(text: T) -> u64 + where + T: AsRef<[u8]>, + { + let text_ref = text.as_ref(); + let text_pointer = text_ref.as_ptr() as _; + let text_length = text_ref.len(); + let result = unsafe { sz_checksum(text_pointer, text_length) }; + return result; + } + + /// Computes a 64-bit AES-based hash value for a given byte slice `text`. + /// This function is designed to provide a high-quality hash value for use in + /// hash tables, data structures, and cryptographic applications. + /// Unlike the checksum function, the hash function is order-sensitive. + /// + /// # Arguments + /// + /// * `text`: The byte slice to compute the checksum for. + /// + /// # Returns + /// + /// A `u64` representing the hash value of the input byte slice. + pub fn hash(text: T) -> u64 + where + T: AsRef<[u8]>, + { + let text_ref = text.as_ref(); + let text_pointer = text_ref.as_ptr() as _; + let text_length = text_ref.len(); + let result = unsafe { sz_hash(text_pointer, text_length) }; + return result; + } + /// Locates the first matching substring within `haystack` that equals `needle`. /// This function is similar to the `memmem()` function in LibC, but, unlike `strstr()`, /// it requires the length of both haystack and needle to be known beforehand. @@ -445,7 +492,7 @@ pub mod sz { F: AsRef<[u8]>, S: AsRef<[u8]>, { - edit_distance_bounded(first, second, 0) + edit_distance_bounded(first, second, usize::MAX) } /// Computes the Levenshtein edit distance between two UTF8 strings, using the Wagner-Fisher @@ -465,7 +512,7 @@ pub mod sz { F: AsRef<[u8]>, S: AsRef<[u8]>, { - edit_distance_utf8_bounded(first, second, 0) + edit_distance_utf8_bounded(first, second, usize::MAX) } /// Computes the Hamming edit distance between two strings, counting the number of substituted characters. @@ -987,6 +1034,34 @@ pub trait StringZilla<'a, N> where N: AsRef<[u8]> + 'a, { + /// Computes the checksum value of unsigned bytes in a given string. + /// This function is useful for verifying data integrity and detecting changes in + /// binary data, such as files or network packets. + /// + /// # Examples + /// + /// ``` + /// use stringzilla::StringZilla; + /// + /// let text = "Hello"; + /// assert_eq!(text.sz_checksum(), Some(500)); + /// ``` + fn sz_checksum(&self) -> u64; + + /// Computes a 64-bit AES-based hash value for a given string. + /// This function is designed to provide a high-quality hash value for use in + /// hash tables, data structures, and cryptographic applications. + /// Unlike the checksum function, the hash function is order-sensitive. + /// + /// # Examples + /// + /// ``` + /// use stringzilla::StringZilla; + /// + /// assert_ne!("Hello".sz_hash(), "World".sz_hash()); + /// ``` + fn sz_hash(&self) -> u64; + /// Searches for the first occurrence of `needle` in `self`. /// /// # Examples @@ -1072,6 +1147,45 @@ where /// ``` fn sz_edit_distance(&self, other: N) -> usize; + /// Computes the Levenshtein edit distance between `self` and `other`. 
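+    /// Unlike `sz_edit_distance`, both strings are interpreted as UTF-8 and edits are counted in
+    /// code points rather than bytes: "façade" and "facade" differ by one code point, but by two bytes.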
+ /// + /// # Examples + /// + /// ``` + /// use stringzilla::StringZilla; + /// + /// let first = "kitten"; + /// let second = "sitting"; + /// assert_eq!(first.sz_edit_distance_utf8(second.as_bytes()), 3); + /// ``` + fn sz_edit_distance_utf8(&self, other: N) -> usize; + + /// Computes the bounded Levenshtein edit distance between `self` and `other`. + /// + /// # Examples + /// + /// ``` + /// use stringzilla::StringZilla; + /// + /// let first = "kitten"; + /// let second = "sitting"; + /// assert_eq!(first.sz_edit_distance_bounded(second.as_bytes()), 3); + /// ``` + fn sz_edit_distance_bounded(&self, other: N, bound: usize) -> usize; + + /// Computes the bounded Levenshtein edit distance between `self` and `other`. + /// + /// # Examples + /// + /// ``` + /// use stringzilla::StringZilla; + /// + /// let first = "kitten"; + /// let second = "sitting"; + /// assert_eq!(first.sz_edit_distance_utf8_bounded(second.as_bytes()), 3); + /// ``` + fn sz_edit_distance_utf8_bounded(&self, other: N, bound: usize) -> usize; + /// Computes the alignment score between `self` and `other` using the specified /// substitution matrix and gap penalty. /// @@ -1231,7 +1345,6 @@ where /// assert_eq!(matches, vec![b"!", b"d", b"l", b"r", b"w", b" ", b",", b"l", b"l", b"H"]); /// ``` fn sz_find_last_not_of(&'a self, needles: &'a N) -> RangeRMatches<'a>; - } impl<'a, T, N> StringZilla<'a, N> for T @@ -1239,6 +1352,14 @@ where T: AsRef<[u8]> + ?Sized, N: AsRef<[u8]> + 'a, { + fn sz_checksum(&self) -> u64 { + sz::checksum(self) + } + + fn sz_hash(&self) -> u64 { + sz::hash(self) + } + fn sz_find(&self, needle: N) -> Option { sz::find(self, needle) } @@ -1267,6 +1388,18 @@ where sz::edit_distance(self, other) } + fn sz_edit_distance_utf8(&self, other: N) -> usize { + sz::edit_distance_utf8(self, other) + } + + fn sz_edit_distance_bounded(&self, other: N, bound: usize) -> usize { + sz::edit_distance_bounded(self, other, bound) + } + + fn sz_edit_distance_utf8_bounded(&self, other: N, bound: usize) -> usize { + sz::edit_distance_utf8_bounded(self, other, bound) + } + fn sz_alignment_score(&self, other: N, matrix: [[i8; 256]; 256], gap: i8) -> isize { sz::alignment_score(self, other, matrix, gap) } From 62ca6a0e4635cd251bc97530b10438aa13a08eb5 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 09:12:02 +0000 Subject: [PATCH 006/751] Fix: Default Levenshtein upper bound --- python/lib.c | 7 +++---- swift/StringProtocol+StringZilla.swift | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/python/lib.c b/python/lib.c index dcf96625..c5346772 100644 --- a/python/lib.c +++ b/python/lib.c @@ -1858,8 +1858,8 @@ static PyObject *_Str_edit_distance(PyObject *self, PyObject *args, PyObject *kw return NULL; } - Py_ssize_t bound = 0; // Default value for bound - if (bound_obj && ((bound = PyLong_AsSsize_t(bound_obj)) < 0)) { + sz_size_t bound = SZ_SIZE_MAX; // Default value for bound + if (bound_obj && ((bound = (sz_size_t)PyLong_AsSize_t(bound_obj)) == (sz_size_t)(-1))) { PyErr_Format(PyExc_ValueError, "Bound must be a non-negative integer"); return NULL; } @@ -1877,8 +1877,7 @@ static PyObject *_Str_edit_distance(PyObject *self, PyObject *args, PyObject *kw reusing_allocator.free = &temporary_memory_free; reusing_allocator.handle = &temporary_memory; - sz_size_t distance = - function(str1.start, str1.length, str2.start, str2.length, (sz_size_t)bound, &reusing_allocator); + sz_size_t distance = function(str1.start, str1.length, str2.start, 
str2.length, bound, &reusing_allocator); // Check for memory allocation issues if (distance == SZ_SIZE_MAX) { diff --git a/swift/StringProtocol+StringZilla.swift b/swift/StringProtocol+StringZilla.swift index 0f7b36bc..d90c8afc 100644 --- a/swift/StringProtocol+StringZilla.swift +++ b/swift/StringProtocol+StringZilla.swift @@ -255,7 +255,7 @@ public extension StringZillaViewable { /// - Throws: If a memory allocation error has happened. @_specialize(where Self == String, S == String) @_specialize(where Self == String.UTF8View, S == String.UTF8View) - func editDistance(from other: S, bound: UInt64 = 0) throws -> UInt64? { + func editDistance(from other: S, bound: UInt64 = UInt64.max) throws -> UInt64? { var result: UInt64? // Use a do-catch block to handle potential errors From 0ee549a106b1ee524fa8059888219c03635e11e6 Mon Sep 17 00:00:00 2001 From: Govind Date: Sat, 7 Dec 2024 12:08:15 +0100 Subject: [PATCH 007/751] Make: Inline ASM for detecting CPU features on ARM Closes #143 --- c/lib.c | 49 ++++++++++++++++++++++++------- include/stringzilla/stringzilla.h | 5 ++-- 2 files changed, 40 insertions(+), 14 deletions(-) diff --git a/c/lib.c b/c/lib.c index ee48400e..f38ac534 100644 --- a/c/lib.c +++ b/c/lib.c @@ -38,6 +38,43 @@ extern void *malloc(size_t length); #endif #endif +// On Apple Silicon, `mrs` is not allowed in user-space, so we need to use the `sysctl` API. +#if defined(__APPLE__) && defined(__MACH__) +#define SZ_APPLE 1 +#include +#endif + +#if defined(__linux__) +#define SZ_LINUX 1 +#endif + +SZ_INTERNAL sz_capability_t sz_capabilities_arm(void) { + // https://github.com/ashvardanian/SimSIMD/blob/28e536083602f85ad0c59456782c8864463ffb0e/include/simsimd/simsimd.h#L434 + // for documentation on how we detect capabilities across different ARM platforms. +#if defined(SZ_APPLE) + + // On Apple Silicon, `mrs` is not allowed in user-space, so we need to use the `sysctl` API. 
+ uint32_t supports_neon = 0; + size_t size = sizeof(supports_neon); + if (sysctlbyname("hw.optional.neon", &supports_neon, &size, NULL, 0) != 0) supports_neon = 0; + + return (sz_capability_t)( // + (sz_cap_arm_neon_k * (supports_neon)) | // + (sz_cap_serial_k)); + +#elif defined(SZ_LINUX) + unsigned supports_neon = 1; // NEON is always supported + __asm__ __volatile__("mrs %0, ID_AA64PFR0_EL1" : "=r"(id_aa64pfr0_el1)); + unsigned supports_sve = ((id_aa64pfr0_el1 >> 32) & 0xF) >= 1; + return (sz_capability_t)( // + (sz_cap_neon_k * (supports_neon)) | // + (sz_cap_sve_k * (supports_sve)) | // + (sz_cap_serial_k)); +#else // SIMSIMD_DEFINED_LINUX + return sz_cap_serial_k; +#endif +} + SZ_DYNAMIC sz_capability_t sz_capabilities(void) { #if SZ_USE_X86_AVX512 || SZ_USE_X86_AVX2 @@ -96,22 +133,12 @@ SZ_DYNAMIC sz_capability_t sz_capabilities(void) { #if SZ_USE_ARM_NEON || SZ_USE_ARM_SVE - // Every 64-bit Arm CPU supports NEON - unsigned supports_neon = 1; - unsigned supports_sve = 0; - unsigned supports_sve2 = 0; - sz_unused(supports_sve); - sz_unused(supports_sve2); - - return (sz_capability_t)( // - (sz_cap_arm_neon_k * supports_neon) | // - (sz_cap_serial_k)); + return sz_capabilities_arm(); #endif // SIMSIMD_TARGET_ARM return sz_cap_serial_k; } - typedef struct sz_implementations_t { sz_equal_t equal; sz_order_t order; diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index 7aa9e6da..588a3282 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -260,7 +260,8 @@ typedef enum sz_capability_t { sz_cap_arm_neon_k = 1 << 10, /// ARM NEON capability sz_cap_arm_sve_k = 1 << 11, /// ARM SVE capability TODO: Not yet supported or used - + sz_cap_arm_sve2_k = 1 << 12, + sz_cap_arm_sve2p1_k = 1 << 13, sz_cap_x86_avx2_k = 1 << 20, /// x86 AVX2 capability sz_cap_x86_avx512f_k = 1 << 21, /// x86 AVX512 F capability sz_cap_x86_avx512bw_k = 1 << 22, /// x86 AVX512 BW instruction capability @@ -268,8 +269,6 @@ typedef enum sz_capability_t { sz_cap_x86_avx512vbmi_k = 1 << 24, /// x86 AVX512 VBMI instruction capability sz_cap_x86_gfni_k = 1 << 25, /// x86 AVX512 GFNI instruction capability - sz_cap_x86_avx512vbmi2_k = 1 << 26, /// x86 AVX512 VBMI 2 instruction capability - } sz_capability_t; /** From 43471aa8131d17a6d6a4bf521b5a99aa2b59bd54 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:11:42 +0000 Subject: [PATCH 008/751] Add: New Levenshtein distance kernels --- README.md | 21 +- include/stringzilla/stringzilla.h | 501 +++++++++++++++++---- scripts/test.cpp | 23 +- scripts/test_levenshtein.ipynb | 706 +++++++++++++++++++++--------- 4 files changed, 957 insertions(+), 294 deletions(-) diff --git a/README.md b/README.md index dbfd3f9b..d5c59ff9 100644 --- a/README.md +++ b/README.md @@ -1367,6 +1367,20 @@ Other algorithms previously considered and deprecated: > [Exact String Matching Algorithms in Java](https://www-igm.univ-mlv.fr/~lecroq/string). > [SIMD-friendly algorithms for substring searching](http://0x80.pl/articles/simd-strfind.html). +### Exact Multiple Substring Search + +Few algorithms for multiple substring search are known. +Most are based on the Aho-Corasick automaton, which is a generalization of the KMP algorithm. +The naive implementation, however: + +- Allocates disjoint memory for each Trie node and Automaton state. +- Requires a lot of pointer chasing, limiting speculative execution. 
+- Has a lot of branches and conditional moves, which are hard to predict. +- Matches text a character at a time, which is slow on modern CPUs. + +There are several ways to improve the original algorithm. +One is to use sparse DFA representation, which is more cache-friendly, but would require extra processing to navigate state transitions. + ### Levenshtein Edit Distance Levenshtein distance is the best known edit-distance for strings, that checks, how many insertions, deletions, and substitutions are needed to transform one string to another. @@ -1388,10 +1402,11 @@ It's less known, than the others, derived from the Baeza-Yates-Gonnet algorithm, StringZilla introduces a different approach, extensively used in Unum's internal combinatorial optimization libraries. The approach doesn't change the number of trivial operations, but performs them in a different order, removing the data dependency, that occurs when computing the insertion costs. This results in much better vectorization for intra-core parallelism and potentially multi-core evaluation of a single request. +Moreover, it's easy to generalize to weighted edit-distances, where the cost of a substitution between two characters may not be the same for all pairs, often used in bioinformatics. Next design goals: -- [ ] Generalize fast traversals to rectangular matrices. +- [x] Generalize fast traversals to non-square matrices. - [ ] Port x86 AVX-512 solution to Arm NEON. > § Reading materials. @@ -1425,6 +1440,10 @@ With that solved, the SIMD implementation will become 5x faster than the serial [faq-dipeptide]: https://en.wikipedia.org/wiki/Dipeptide [faq-titin]: https://en.wikipedia.org/wiki/Titin +Next design goals: + +- [ ] Needleman-Wunsch Automata + ### Memory Copying, Fills, and Moves A lot has been written about the time computers spend copying memory and how that operation is implemented in LibC. diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index 7aa9e6da..b6622c27 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -1,7 +1,7 @@ /** - * @brief StringZilla is a collection of simple string algorithms, designed to be used in Big Data applications. - * It may be slower than LibC, but has a broader & cleaner interface, and a very short implementation - * targeting modern x86 CPUs with AVX-512 and Arm NEON and older CPUs with SWAR and auto-vectorization. + * @brief StringZilla is a collection of advanced string algorithms, designed to be used in Big Data applications. + * It is generally faster than LibC, and has a broader & cleaner interface, and targets modern x86 CPUs + * with AVX-512 and Arm NEON and older CPUs with SWAR and auto-vectorization. 
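+ *        The SIMD backends can be disabled individually at compile time via the `SZ_USE_*` macros,
+ *        and the shared library in `c/lib.c` also dispatches between them at runtime using `sz_capabilities`.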
* * Consider overriding the following macros to customize the library: * @@ -843,12 +843,12 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz * @see sz_hamming_distance_utf8 * @see https://en.wikipedia.org/wiki/Hamming_distance */ -SZ_DYNAMIC sz_size_t sz_hamming_distance(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); +SZ_DYNAMIC sz_size_t sz_hamming_distance( // + sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); /** @copydoc sz_hamming_distance */ -SZ_PUBLIC sz_size_t sz_hamming_distance_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); +SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // + sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); /** * @brief Computes the Hamming distance between two @b UTF8 strings - number of not matching characters. @@ -887,10 +887,11 @@ typedef sz_size_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_s * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, * so the memory usage is linear in relation to ::a_length and ::b_length. * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for edit distance, the `bound` if was exceeded or `SZ_SIZE_MAX` - * if the memory allocation failed. + * @param bound Exclusive upper bound on the distance, that allows us to exit early. + * Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore. + * Pass zero to check if the strings are equal. + * @return Unsigned integer for the edit distance. Zero means the strings are equal. + * Returns the `bound` if it was exceeded or `SZ_SIZE_MAX` if the memory allocation failed. * * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default * @see https://en.wikipedia.org/wiki/Levenshtein_distance @@ -1022,8 +1023,9 @@ typedef void (*sz_hashes_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_size_t, sz_hash_ * @param window_length Length of the rolling window in bytes. * @see sz_hashes, sz_hashes_intersection */ -SZ_PUBLIC void sz_hashes_fingerprint(sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_ptr_t fingerprint, sz_size_t fingerprint_bytes); +SZ_PUBLIC void sz_hashes_fingerprint( // + sz_cptr_t text, sz_size_t length, sz_size_t window_length, // + sz_ptr_t fingerprint, sz_size_t fingerprint_bytes); typedef void (*sz_hashes_fingerprint_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_ptr_t, sz_size_t); @@ -1041,8 +1043,9 @@ typedef void (*sz_hashes_fingerprint_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_ptr_ * @param window_length Length of the rolling window in bytes. * @see sz_hashes, sz_hashes_fingerprint */ -SZ_PUBLIC sz_size_t sz_hashes_intersection(sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_cptr_t fingerprint, sz_size_t fingerprint_bytes); +SZ_PUBLIC sz_size_t sz_hashes_intersection( // + sz_cptr_t text, sz_size_t length, sz_size_t window_length, // + sz_cptr_t fingerprint, sz_size_t fingerprint_bytes); typedef sz_size_t (*sz_hashes_intersection_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_cptr_t, sz_size_t); @@ -1773,8 +1776,8 @@ SZ_INTERNAL void _sz_locate_needle_anomalies(sz_cptr_t start, sz_size_t length, // TODO: Investigate alternative strategies for long needles. 
// On very long needles we have the luxury to choose! - // Often dealing with UTF8, we will likely benfit from shifting the first and second characters - // further to the right, to achieve not only uniqness within the needle, but also avoid common + // Often dealing with UTF8, we will likely benefit from shifting the first and second characters + // further to the right, to achieve not only uniqueness within the needle, but also avoid common // rune prefixes of 2-, 3-, and 4-byte codes. if (length > 8) { // Pivot the first and second points right, until we find a character, that: @@ -1788,7 +1791,7 @@ SZ_INTERNAL void _sz_locate_needle_anomalies(sz_cptr_t start, sz_size_t length, sz_u8_t const *start_u8 = (sz_u8_t const *)start; sz_size_t vibrant_first = *first, vibrant_second = *second, vibrant_third = *third; - // Let's begin with the seccond character, as the termination criterea there is more obvious + // Let's begin with the seccond character, as the termination criteria there is more obvious // and we may end up with more variants to check for the first candidate. for (; (start_u8[vibrant_second] > 191 || start_u8[vibrant_second] == start_u8[vibrant_third]) && (vibrant_second + 1 < vibrant_third); @@ -2455,18 +2458,18 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // current_distances[0] = current_distances[1] = 1; // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_skew_diagonal_index = 2; - for (; next_skew_diagonal_index != n; ++next_skew_diagonal_index) { - sz_size_t const next_skew_diagonal_length = next_skew_diagonal_index + 1; - for (sz_size_t i = 0; i + 2 < next_skew_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[next_skew_diagonal_index - i - 2] != longer[i]; + sz_size_t next_diagonal_index = 2; + for (; next_diagonal_index != n; ++next_diagonal_index) { + sz_size_t const next_diagonal_length = next_diagonal_index + 1; + for (sz_size_t i = 0; i + 2 < next_diagonal_length; ++i) { + sz_size_t cost_of_substitution = shorter[next_diagonal_index - i - 2] != longer[i]; sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; next_distances[i + 1] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); } - // Don't forget to populate the first row and the fiest column of the Levenshtein matrix. - next_distances[0] = next_distances[next_skew_diagonal_length - 1] = next_skew_diagonal_index; - // Perform a circular rotarion of those buffers, to reuse the memory. + // Don't forget to populate the first row and the first column of the Levenshtein matrix. + next_distances[0] = next_distances[next_diagonal_length - 1] = next_diagonal_index; + // Perform a circular rotation of those buffers, to reuse the memory. sz_size_t *temporary = previous_distances; previous_distances = current_distances; current_distances = next_distances; @@ -2476,17 +2479,16 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a // larger diagonal. From now onwards, we will be shrinking. Instead of adding value equal to the skewed diagonal // index on either side, we will be cropping those values out. 
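+    // As a concrete example, for two 3-byte strings `n == 4`, so there are `2 * 4 - 1 == 7` skewed diagonals
+    // with lengths 1, 2, 3, 4, 3, 2, 1. Diagonals 0 and 1 are pre-filled, the loop above covered diagonals
+    // 2 and 3, and the loop below shrinks through diagonals 4, 5, and 6.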
- sz_size_t total_diagonals = n + n - 1; - for (; next_skew_diagonal_index != total_diagonals; ++next_skew_diagonal_index) { - sz_size_t const next_skew_diagonal_length = total_diagonals - next_skew_diagonal_index; - for (sz_size_t i = 0; i != next_skew_diagonal_length; ++i) { - sz_size_t cost_of_substitution = - shorter[shorter_length - 1 - i] != longer[next_skew_diagonal_index - n + i]; + sz_size_t diagonals_count = n + n - 1; + for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { + sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; + for (sz_size_t i = 0; i != next_diagonal_length; ++i) { + sz_size_t cost_of_substitution = shorter[shorter_length - 1 - i] != longer[next_diagonal_index - n + i]; sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; next_distances[i] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); } - // Perform a circular rotarion of those buffers, to reuse the memory, this time, with a shift, + // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift, // dropping the first element in the current array. sz_size_t *temporary = previous_distances; previous_distances = current_distances + 1; @@ -2737,7 +2739,8 @@ SZ_PUBLIC sz_size_t sz_edit_distance_serial( // --longer_length, --shorter_length); // Bounded computations may exit early. - if (bound) { + int const is_bounded = bound < longer_length; + if (is_bounded) { // If one of the strings is empty - the edit distance is equal to the length of the other one. if (longer_length == 0) return sz_min_of_two(shorter_length, bound); if (shorter_length == 0) return sz_min_of_two(longer_length, bound); @@ -2746,7 +2749,7 @@ SZ_PUBLIC sz_size_t sz_edit_distance_serial( // } if (shorter_length == 0) return longer_length; // If no mismatches were found - the distance is zero. - if (shorter_length == longer_length && !bound) + if (shorter_length == longer_length && !is_bounded) return _sz_edit_distance_skewed_diagonals_serial(longer, longer_length, shorter, shorter_length, bound, alloc); return _sz_edit_distance_wagner_fisher_serial(longer, longer_length, shorter, shorter_length, bound, sz_false_k, alloc); @@ -4555,10 +4558,10 @@ SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t start, sz_size_t length, sz_size_t windo * @brief AVX-512 implementation of the string search algorithms. 
* * Different subsets of AVX-512 were introduced in different years: - * * 2017 SkyLake: F, CD, ER, PF, VL, DQ, BW - * * 2018 CannonLake: IFMA, VBMI - * * 2019 IceLake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES - * * 2020 TigerLake: VP2INTERSECT + * - 2017 SkyLake: F, CD, ER, PF, VL, DQ, BW + * - 2018 CannonLake: IFMA, VBMI + * - 2019 IceLake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES + * - 2020 TigerLake: VP2INTERSECT */ #pragma region AVX512 Implementation @@ -5130,11 +5133,269 @@ SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n return SZ_NULL_CHAR; } +#pragma clang attribute pop +#pragma GCC pop_options + +#pragma GCC push_options +#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2") +#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ + apply_to = function) + +/** + * @brief Computes the edit distance between two very short byte-strings using the AVX-512VBMI extensions. + * + * Applies to string lengths up to 63, and evaluates at most (63 * 2 + 1 = 127) diagonals, or just as many loop cycles. + * Supports an early exit, if the distance is bounded. + * Keeps all of the data and Levenshtein matrices skew diagonal in just a couple of registers. + * Benefits from the @b `vpermb` instructions, that can rotate the bytes across the entire ZMM register. + */ +SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_avx512( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // + sz_size_t bound) { + + sz_size_t const max_length = 63u; + sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); + sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); + + // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register. + // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. + sz_size_t const shorter_dim = shorter_length + 1; + sz_size_t const longer_dim = longer_length + 1; + + // The next few buffers will be swapped around. + sz_u512_vec_t previous_vec, current_vec, next_vec; + sz_u512_vec_t gaps_vec, substitutions_vec; + + // Load the strings into ZMM registers - just once. + sz_u512_vec_t longer_vec, shorter_vec, shorter_rotated_vec, rotate_left_vec, rotate_right_vec, ones_vec, bound_vec; + longer_vec.zmm = _mm512_maskz_loadu_epi8(_sz_u64_mask_until(longer_length), longer); + rotate_left_vec.zmm = _mm512_set_epi8( // + 0, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, // + 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, // + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, // + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + rotate_right_vec.zmm = _mm512_set_epi8( // + 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, // + 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, // + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, // + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 63); + ones_vec.zmm = _mm512_set1_epi8(1); + bound_vec.zmm = _mm512_set1_epi8(bound <= 255 ? (sz_u8_t)bound : 255); + + // To simplify comparisons and traversals, we want to reverse the order of bytes in the shorter string. 
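+    // For example, a 3-byte string "abc" lands with 'a' in byte 63, 'b' in byte 62, and 'c' in byte 61 of the
+    // register, so the loops below can slide it against the longer string with cheap byte-rotations.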
+ for (sz_size_t i = 0; i != shorter_length; ++i) shorter_vec.u8s[63 - i] = shorter[i]; + shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_vec.zmm); + + // Let's say we are dealing with 3 and 5 letter words. + // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim). + // It will have: + // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3. + // - 2 diagonals of fixed length, at positions: 4, 5. + // - 3 diagonals of decreasing length, at positions: 6, 7, 8. + sz_size_t const diagonals_count = shorter_dim + longer_dim - 1; + + // Initialize the first two diagonals: + // + // previous_vec.u8s[0] = 0; + // current_vec.u8s[0] = current_vec.u8s[1] = 1; + // + // We can do a similar thing with vector ops: + previous_vec.zmm = _mm512_setzero_si512(); + current_vec.zmm = _mm512_set1_epi8(1); + + // We skip diagonals 0 and 1, as they are trivial. + // We will start with diagonal 2, which has length 3, with the first and last elements being preset, + // so we are effectively computing just one value, as will be marked by a single set bit in + // the `next_diagonal_mask` on the very first iteration. + sz_size_t next_diagonal_index = 2; + __mmask64 next_diagonal_mask = 0; + + // Progress through the upper triangle of the Levenshtein matrix. + for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) { + // After this iteration, the values at offset `0` and `next_diagonal_index` in the `next_vec` + // should be set to `next_diagonal_index`, but it's easier to broadcast the value to the whole vector, + // and later merge with a mask with new values. + next_vec.zmm = _mm512_set1_epi8((sz_u8_t)next_diagonal_index); + + // The mask also adds one set bit. + next_diagonal_mask = _kor_mask64(next_diagonal_mask, 1); + next_diagonal_mask = _kshiftli_mask64(next_diagonal_mask, 1); + + // Check for equality between string slices. + __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); + substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); + substitutions_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, substitutions_vec.zmm); + gaps_vec.zmm = _mm512_add_epi8( + // Insertions or deletions + _mm512_min_epu8(_mm512_permutexvar_epi8(rotate_right_vec.zmm, current_vec.zmm), current_vec.zmm), + ones_vec.zmm); + next_vec.zmm = _mm512_mask_min_epu8(next_vec.zmm, next_diagonal_mask, gaps_vec.zmm, substitutions_vec.zmm); + + // Mark the current skewed diagonal as the previous one and the next one as the current one. + previous_vec.zmm = current_vec.zmm; + current_vec.zmm = next_vec.zmm; + + // Shift the shorter string + shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_rotated_vec.zmm); + + // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. + __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); + if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // + return SZ_SIZE_MAX; + } + } + + // Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles. + for (; next_diagonal_index != longer_dim; ++next_diagonal_index) { + // After this iteration, the value `shorted_dim - 1` in the `next_vec` + // should be set to `next_diagonal_index`, but it's easier to broadcast the value to the whole vector, + // and later merge with a mask with new values. 
+ next_vec.zmm = _mm512_set1_epi8((sz_u8_t)next_diagonal_index); + + // Make sure we update the first entry. + next_diagonal_mask = _kor_mask64(next_diagonal_mask, 1); + + // Check for equality between string slices. + __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); + substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); + gaps_vec.zmm = _mm512_add_epi8( + // Insertions or deletions + _mm512_min_epu8(current_vec.zmm, _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm)), + ones_vec.zmm); + next_vec.zmm = _mm512_mask_min_epu8(next_vec.zmm, next_diagonal_mask, gaps_vec.zmm, substitutions_vec.zmm); + + // Mark the current skewed diagonal as the previous one and the next one as the current one. + previous_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm); + current_vec.zmm = next_vec.zmm; + + // Let's shift the longer string now. + longer_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, longer_vec.zmm); + + // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. + __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); + if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // + return SZ_SIZE_MAX; + } + } + + // Now let's handle the bottom right triangle. + for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { + + // Check for equality between string slices. + __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); + substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); + gaps_vec.zmm = _mm512_add_epi8( + // Insertions or deletions + _mm512_min_epu8(current_vec.zmm, _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm)), + ones_vec.zmm); + next_vec.zmm = _mm512_min_epu8(gaps_vec.zmm, substitutions_vec.zmm); + + // Mark the current skewed diagonal as the previous one and the next one as the current one. + previous_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm); + current_vec.zmm = next_vec.zmm; + + // Let's shift the longer string now. + longer_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, longer_vec.zmm); + + // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. + __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); + if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // + return SZ_SIZE_MAX; + } + // In every following iterations we take use a shorter prefix of each register, + // but we don't need to update the `next_diagonal_mask` anymore... except for the early exit. + next_diagonal_mask = _kshiftri_mask64(next_diagonal_mask, 1); + } + return current_vec.u8s[0]; +} + +/** + * @brief Computes the edit distance between two somewhat short bytes-strings using the AVX-512VBMI extensions. + * + * Applies to string lengths up to 127, and evaluates at most (127 * 2 + 1 = 255) diagonals. + * Supports an early exit, if the distance is bounded. + * Uses a lot more CPU registers space, than the `upto63` variant. + * Benefits from the @b `vpermi2b` instructions, that can rotate the bytes in 2 registers at once. + * + * This may be one of the most freuqently called kernels for: + * - source code analysis, assuming most lines are either under 80 or under 120 characters long. 
+ *  - DNA sequence alignment, as most short reads are 50-300 characters long.
+ */
+SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto127_avx512( //
+    sz_cptr_t shorter, sz_size_t shorter_length,                         //
+    sz_cptr_t longer, sz_size_t longer_length,                           //
+    sz_size_t bound) {
+    sz_unused(shorter && shorter_length && longer && longer_length && bound);
+    return 0;
+}
+
+/**
+ *  @brief  Computes the edit distance between two longer byte-strings using the AVX-512VBMI extensions.
+ *
+ *  Applies to string lengths up to 255, and evaluates at most (255 * 2 + 1 = 511) diagonals.
+ *  Supports an early exit, if the distance is bounded.
+ *  Uses much more CPU register space than the `upto63` variant.
+ *
+ *  Each of the 2 strings ends up occupying 4 ZMM registers, and each of the 3 diagonals uses 4 ZMM registers.
+ *  So 20 of the 32 registers are persistently occupied, and the rest are used for math temporarily.
+ *  This is the largest space-efficient variant, as strings beyond 255 characters may require
+ *  16-bit accumulators, which would be a significant bottleneck.
+ */
+SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto_avx512( //
+    sz_cptr_t shorter, sz_size_t shorter_length,                      //
+    sz_cptr_t longer, sz_size_t longer_length,                        //
+    sz_size_t bound) {
+    sz_unused(shorter && shorter_length && longer && longer_length && bound);
+    return 0;
+}
+
+/**
+ *  @brief  Computes the edit distance between two longer byte-strings using the AVX-512VBMI extensions,
+ *          assuming the upper distance bound cannot exceed 255, but the string length can be arbitrary.
+ *
+ *  Applies to string lengths up to 255, and evaluates at most (255 * 2 + 1 = 511) diagonals.
+ *  Supports an early exit, if the distance is bounded.
+ *  Uses much more CPU register space than the `upto63` variant.
+ *
+ *  Each of the 2 strings ends up occupying 4 ZMM registers, and each of the 3 diagonals uses 4 ZMM registers.
+ *  So 20 of the 32 registers are persistently occupied, and the rest are used for math temporarily.
+ *  This is the largest space-efficient variant, as strings beyond 255 characters may require
+ *  16-bit accumulators, which would be a significant bottleneck.
+ */
+SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto255bound_avx512( //
+    sz_cptr_t shorter, sz_size_t shorter_length,                             //
+    sz_cptr_t longer, sz_size_t longer_length,                               //
+    sz_size_t bound) {
+    sz_unused(shorter && shorter_length && longer && longer_length && bound);
+    return 0;
+}
+
+/**
+ *  @brief  Computes the edit distance between two mid-length UTF-8 strings using the AVX-512VBMI extensions.
+ *
+ *  Applies to string lengths up to 127, and evaluates at most (127 * 2 + 1 = 255) diagonals.
+ *  Supports an early exit, if the distance is bounded.
+ *  Benefits from the @b `valignd` instruction used to rotate the unpacked UTF-32 Unicode codepoints.
+ *
+ *  Each string is unpacked into 128 characters * 4 bytes per character / 64 bytes per register = 8 registers.
+ * + */ +SZ_INTERNAL sz_size_t _sz_edit_distance_utf8_skewed_diagonals_upto127_avx512( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // + sz_size_t bound) { + sz_unused(shorter && shorter_length && longer && longer_length && bound); + return 0; +} + SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // sz_cptr_t shorter, sz_size_t shorter_length, // sz_cptr_t longer, sz_size_t longer_length, // sz_size_t bound, sz_memory_allocator_t *alloc) { + sz_unused(shorter && longer && bound && alloc); + // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. sz_memory_allocator_t global_alloc; if (!alloc) { @@ -5143,25 +5404,27 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // } // TODO: Generalize! - sz_size_t max_length = 256u * 256u; - sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix."); - sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet."); + sz_size_t const max_length = 256u * 256u; + sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); sz_unused(longer_length && bound && max_length); +#if 0 // We are going to store 3 diagonals of the matrix. - // The length of the longest (main) diagonal would be `n = (shorter_length + 1)`. - sz_size_t n = shorter_length + 1; + // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. + sz_size_t const shorter_dim = shorter_length + 1; + sz_size_t const longer_dim = longer_length + 1; // Unlike the serial version, we also want to avoid reverse-order iteration over teh shorter string. // So let's allocate a bit more memory and reverse-export our shorter string into that buffer. - sz_size_t buffer_length = sizeof(sz_u16_t) * n * 3 + shorter_length; - sz_u16_t *distances = (sz_u16_t *)alloc->allocate(buffer_length, alloc->handle); + sz_size_t const buffer_length = sizeof(sz_u16_t) * longer_dim * 3 + shorter_length; + sz_u16_t *const distances = (sz_u16_t *)alloc->allocate(buffer_length, alloc->handle); if (!distances) return SZ_SIZE_MAX; + // The next few pointers will be swapped around. sz_u16_t *previous_distances = distances; - sz_u16_t *current_distances = previous_distances + n; - sz_u16_t *next_distances = current_distances + n; - sz_ptr_t shorter_reversed = (sz_ptr_t)(next_distances + n); + sz_u16_t *current_distances = previous_distances + longer_dim; + sz_u16_t *next_distances = current_distances + longer_dim; + sz_ptr_t const shorter_reversed = (sz_ptr_t)(next_distances + longer_dim); // Export the reversed string into the buffer. for (sz_size_t i = 0; i != shorter_length; ++i) shorter_reversed[i] = shorter[shorter_length - 1 - i]; @@ -5175,47 +5438,61 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // sz_u512_vec_t insertions_vec, deletions_vec, substitutions_vec, next_vec; sz_u512_vec_t ones_u16_vec; ones_u16_vec.zmm = _mm512_set1_epi16(1); + // This is a mixed-precision implementation, using 8-bit representations for part of the operations. // Even there, in case `SZ_USE_X86_AVX2=0`, let's use the `sz_u512_vec_t` type, addressing the first YMM halfs. 
sz_u512_vec_t shorter_vec, longer_vec; sz_u512_vec_t ones_u8_vec; ones_u8_vec.ymms[0] = _mm256_set1_epi8(1); + // Let's say we are dealing with 3 and 5 letter words. + // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim). + // It will have: + // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3. + // - 2 diagonals of fixed length, at positions: 4, 5. + // - 3 diagonals of decreasing length, at positions: 6, 7, 8. + sz_size_t const diagonals_count = shorter_dim + longer_dim - 1; + // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_skew_diagonal_index = 2; - for (; next_skew_diagonal_index != n; ++next_skew_diagonal_index) { - sz_size_t const next_skew_diagonal_length = next_skew_diagonal_index + 1; - for (sz_size_t i = 0; i + 2 < next_skew_diagonal_length;) { - sz_u32_t remaining_length = (sz_u32_t)(next_skew_diagonal_length - i - 2); + sz_size_t next_diagonal_index = 2; + for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) { + sz_size_t const next_diagonal_length = next_diagonal_index + 1; + for (sz_size_t offset_within_diagonal = 0; offset_within_diagonal + 2 < next_diagonal_length;) { + sz_u32_t remaining_length = (sz_u32_t)(next_diagonal_length - offset_within_diagonal - 2); sz_u32_t register_length = remaining_length < 32 ? remaining_length : 32; sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length); - longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + i); - // Our original code addressed the shorter string `[next_skew_diagonal_index - i - 2]` for growing `i`. - // If the `shorter` string was reversed, the `[next_skew_diagonal_index - i - 2]` would - // be equal to `[shorter_length - 1 - next_skew_diagonal_index + i + 2]`. - // Which simplified would be equal to `[shorter_length - next_skew_diagonal_index + i + 1]`. - shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8( - remaining_length_mask, shorter_reversed + shorter_length - next_skew_diagonal_index + i + 1); + longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + offset_within_diagonal); + // Our original code addressed the shorter string `[next_diagonal_index - offset_within_diagonal - 2]` + // for growing `offset_within_diagonal`. If the `shorter` string was reversed, the + // `[next_diagonal_index - offset_within_diagonal - 2]` would be equal to `[shorter_length - 1 - + // next_diagonal_index + offset_within_diagonal + 2]`. Which simplified would be equal to + // `[shorter_length - next_diagonal_index + offset_within_diagonal + 1]`. + shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8( // + remaining_length_mask, + shorter_reversed + shorter_length - next_diagonal_index + offset_within_diagonal + 1); // For substitutions, perform the equality comparison using AVX2 instead of AVX-512 // to get the result as a vector, instead of a bitmask. Adding 1 to every scalar we can overflow // transforming from {0xFF, 0} values to {0, 1} values - exactly what we need. Then - upcast to 16-bit. 
substitutions_vec.zmm = _mm512_cvtepi8_epi16( // _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0])); substitutions_vec.zmm = _mm512_add_epi16( // - substitutions_vec.zmm, _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + i)); + substitutions_vec.zmm, + _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + offset_within_diagonal)); // For insertions and deletions, on modern hardware, it's faster to issue two separate loads, // than rotate the bytes in the ZMM register. - insertions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i); - deletions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i + 1); + insertions_vec.zmm = + _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + offset_within_diagonal); + deletions_vec.zmm = + _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + offset_within_diagonal + 1); // First get the minimum of insertions and deletions. next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm); next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm); - _mm512_mask_storeu_epi16(next_distances + i + 1, remaining_length_mask, next_vec.zmm); - i += register_length; + _mm512_mask_storeu_epi16(next_distances + offset_within_diagonal + 1, remaining_length_mask, next_vec.zmm); + offset_within_diagonal += register_length; } - // Don't forget to populate the first row and the fiest column of the Levenshtein matrix. - next_distances[0] = next_distances[next_skew_diagonal_length - 1] = (sz_u16_t)next_skew_diagonal_index; - // Perform a circular rotarion of those buffers, to reuse the memory. + // Don't forget to populate the first row and the first column of the Levenshtein matrix. + next_distances[0] = next_distances[next_diagonal_length - 1] = (sz_u16_t)next_diagonal_index; + // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory. sz_u16_t *temporary = previous_distances; previous_distances = current_distances; current_distances = next_distances; @@ -5225,15 +5502,13 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a // larger diagonal. From now onwards, we will be shrinking. Instead of adding value equal to the skewed diagonal // index on either side, we will be cropping those values out. - sz_size_t total_diagonals = n + n - 1; - for (; next_skew_diagonal_index != total_diagonals; ++next_skew_diagonal_index) { - sz_size_t const next_skew_diagonal_length = total_diagonals - next_skew_diagonal_index; - for (sz_size_t i = 0; i != next_skew_diagonal_length;) { - sz_u32_t remaining_length = (sz_u32_t)(next_skew_diagonal_length - i); + for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { + sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; + for (sz_size_t i = 0; i != next_diagonal_length;) { + sz_u32_t remaining_length = (sz_u32_t)(next_diagonal_length - i); sz_u32_t register_length = remaining_length < 32 ? 
remaining_length : 32; sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length); - longer_vec.ymms[0] = - _mm256_maskz_loadu_epi8(remaining_length_mask, longer + next_skew_diagonal_index - n + i); + longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + next_diagonal_index - n + i); // Our original code addressed the shorter string `[shorter_length - 1 - i]` for growing `i`. // If the `shorter` string was reversed, the `[shorter_length - 1 - i]` would // be equal to `[shorter_length - 1 - shorter_length + 1 + i]`. @@ -5257,7 +5532,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // i += register_length; } - // Perform a circular rotarion of those buffers, to reuse the memory, this time, with a shift, + // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory, this time, with a shift, // dropping the first element in the current array. sz_u16_t *temporary = previous_distances; previous_distances = current_distances + 1; @@ -5269,6 +5544,8 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // sz_size_t result = current_distances[0]; alloc->free(distances, buffer_length, alloc->handle); return result; +#endif + return 0; } SZ_INTERNAL sz_size_t sz_edit_distance_avx512( // @@ -5276,21 +5553,37 @@ SZ_INTERNAL sz_size_t sz_edit_distance_avx512( // sz_cptr_t longer, sz_size_t longer_length, // sz_size_t bound, sz_memory_allocator_t *alloc) { - if (shorter_length == longer_length && !bound && shorter_length && shorter_length < 256u * 256u) - return _sz_edit_distance_skewed_diagonals_upto65k_avx512(shorter, shorter_length, longer, longer_length, bound, - alloc); + // Bounded computations may exit early. + int const is_bounded = bound < longer_length; + if (is_bounded) { + // If one of the strings is empty - the edit distance is equal to the length of the other one. + if (longer_length == 0) return sz_min_of_two(shorter_length, bound); + if (shorter_length == 0) return sz_min_of_two(longer_length, bound); + // If the difference in length is beyond the `bound`, there is no need to check at all. + if (longer_length - shorter_length > bound) return bound; + } + + // Make sure the shorter string is actually shorter. + if (shorter_length > longer_length) { + sz_cptr_t temporary = shorter; + shorter = longer; + longer = temporary; + sz_size_t temporary_length = shorter_length; + shorter_length = longer_length; + longer_length = temporary_length; + } + + // Dispatch the right implementation based on the length of the strings. + if (longer_length < 64u) + return _sz_edit_distance_skewed_diagonals_upto63_avx512( // + shorter, shorter_length, longer, longer_length, bound); + // else if (longer_length < 256u * 256u) + // return _sz_edit_distance_skewed_diagonals_upto65k_avx512( // + // shorter, shorter_length, longer, longer_length, bound, alloc); else return sz_edit_distance_serial(shorter, shorter_length, longer, longer_length, bound, alloc); } -#pragma clang attribute pop -#pragma GCC pop_options - -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,bmi,bmi2"))), \ - apply_to = function) - SZ_PUBLIC sz_u64_t sz_checksum_avx512(sz_cptr_t text, sz_size_t length) { // The naive implementation of this function is very simple. // It assumes the CPU is great at handling unaligned "loads". 
@@ -5671,10 +5964,11 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_ sz_u512_vec_t lower_nibbles_vec, higher_nibbles_vec; sz_u512_vec_t bitset_even_vec, bitset_odd_vec; sz_u512_vec_t bitmask_vec, bitmask_lookup_vec; - bitmask_lookup_vec.zmm = _mm512_set_epi8(-128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); + bitmask_lookup_vec.zmm = _mm512_set_epi8( // + -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // + -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // + -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // + -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); while (length) { // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set" @@ -5746,6 +6040,29 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz return sz_rfind_charset_serial(text, length, filter); } +SZ_PUBLIC sz_cptr_t sz_find_many_avx512( // + sz_cptr_t haystack, sz_size_t haystack_length, // + sz_cptr_t const *needles, sz_size_t const *needles_lengths, // + sz_size_t *needle_offset) { + + // When dealing with huge needles vocabularies, like in tokenization workloads, we need to construct an automaton. + // But in many cases, the vocabulary is small enough to use a simpler DFA-less approach, combining the ideas from + // the `sz_find_avx512` and `sz_find_charset_avx512` functions. + // + // Pick the offsets within needles where there is the least variance in the characters. + // Like for "the", "then", "there", "these", "those", "their", "they", "them", "that", "this", "thus", "than": + // + // 0: 't' + // 1: 'h' + // 2: 'e', 'a', 'i', 'o', 'u' + // 3: 'n', 'r', 's', 'i', 'y', 'm', 't' + // + // So depending on our "register budget", we can use a different number of pivot points: offset 0, 1, 2 make + // the most sense if we can only use 3 ZMM registers. + sz_unused(haystack && haystack_length && needles && needles_lengths && needle_offset); + return 0; +} + /** * Computes the Needleman Wunsch alignment score between two strings. * The method uses 32-bit integers to accumulate the running score for every cell in the matrix. 
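The comment inside the new `sz_find_many_avx512` stub above outlines the planned DFA-less approach: pick the needle offsets with the least character variance and use them as cheap SIMD pre-filters before a full comparison. The patch only leaves the stub, so the following Python sketch is an editorial illustration of that selection step, not part of the patch or the library API; the name `pick_pivot_offsets` and the `budget` parameter are hypothetical, and for simplicity only offsets present in every needle are considered.

def pick_pivot_offsets(needles, budget=3):
    # Only offsets covered by every needle can serve as unconditional filters.
    max_offset = min(len(needle) for needle in needles)
    # For each offset, collect the distinct characters it can take across the vocabulary.
    alphabets = [{needle[offset] for needle in needles} for offset in range(max_offset)]
    # Offsets admitting the fewest distinct characters make the cheapest filters.
    ranked = sorted(range(max_offset), key=lambda offset: len(alphabets[offset]))
    return ranked[:budget]

needles = ["the", "then", "there", "these", "those", "their",
           "they", "them", "that", "this", "thus", "than"]
print(pick_pivot_offsets(needles))  # [0, 1, 2]: 't' and 'h' are unique, offset 2 admits 5 characters

With a budget of three ZMM registers this yields offsets 0, 1, and 2, matching the example in the comment above.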
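The reworked `sz_edit_distance_avx512` dispatcher above also adds bound-related pre-checks: empty inputs are resolved immediately, and a length difference beyond the bound needs no matrix evaluation at all, since the edit distance can never be smaller than the difference in lengths. The Python below is an editorial paraphrase of that control flow, not part of the patch; it swaps the strings first for simplicity and takes the fallback kernel as a parameter instead of dispatching between the `upto63` and serial variants.

def dispatch_edit_distance(s1, s2, bound, fallback_kernel):
    # Make sure `shorter` is actually the shorter string.
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)
    if bound < len(longer):  # bounded computations may exit early
        # If one of the strings is empty, the distance is the other one's length.
        if len(longer) == 0:
            return min(len(shorter), bound)
        if len(shorter) == 0:
            return min(len(longer), bound)
        # The distance is at least the length difference, so a difference
        # beyond `bound` decides the outcome without touching the matrix.
        if len(longer) - len(shorter) > bound:
            return bound
    # Otherwise hand the pair to the selected kernel.
    return fallback_kernel(shorter, longer, bound)

For the "kiten" / "katerinas" pair exercised in scripts/test.cpp below, the length difference alone is 4, so any bound of 3 or less is settled by the pre-checks, while larger bounds fall through to the actual kernel.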
diff --git a/scripts/test.cpp b/scripts/test.cpp index cb7d0079..eecc97f0 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -1431,6 +1431,12 @@ static void test_levenshtein_distances() { received_score = sz::alignment_score(r, l, costs, -1); if (received != expected) print_failure("Levenshtein", r, l, expected, received); if ((std::size_t)(-received_score) != expected) print_failure("Scoring", r, l, expected, received_score); + + // Validate the bounded variants: + if (received > 1) { + assert(sz::edit_distance(l, r, received) == received); + assert(sz::edit_distance(r, l, received - 1) == SZ_SIZE_MAX); + } }; for (auto explicit_case : explicit_cases) @@ -1553,6 +1559,20 @@ static void test_stl_containers() { int main(int argc, char const **argv) { + auto dist = _sz_edit_distance_skewed_diagonals_upto63_avx512("kiten", 5, "katerinas", 9, SZ_SIZE_MAX); + sz_assert(dist == 5); + dist = _sz_edit_distance_skewed_diagonals_upto63_avx512("kiten", 5, "katerinas", 9, 3); + sz_assert(dist == SZ_SIZE_MAX); + dist = _sz_edit_distance_skewed_diagonals_upto63_avx512("kiten", 5, "katerinas", 9, 4); + sz_assert(dist == SZ_SIZE_MAX); + dist = _sz_edit_distance_skewed_diagonals_upto63_avx512("kiten", 5, "katerinas", 9, 5); + sz_assert(dist == 5); + dist = _sz_edit_distance_skewed_diagonals_upto63_avx512("kiten", 5, "katerinas", 9, 6); + sz_assert(dist == 5); + + // Similarity measures and fuzzy search + test_levenshtein_distances(); + // Let's greet the user nicely sz_unused(argc && argv); std::printf("Hi, dear tester! You look nice today!\n"); @@ -1596,9 +1616,6 @@ int main(int argc, char const **argv) { test_search_with_misaligned_repetitions(); #endif - // Similarity measures and fuzzy search - test_levenshtein_distances(); - // Sequences of strings test_sequence_algorithms(); test_stl_containers(); diff --git a/scripts/test_levenshtein.ipynb b/scripts/test_levenshtein.ipynb index 4718c386..606939ae 100644 --- a/scripts/test_levenshtein.ipynb +++ b/scripts/test_levenshtein.ipynb @@ -70,6 +70,37 @@ " return matrix[len(s1), len(s2)], matrix" ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('kiten',\n", + " 'katerinas',\n", + " 'distance_wf = np.int64(5)',\n", + " array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],\n", + " [1, 0, 1, 2, 3, 4, 5, 6, 7, 8],\n", + " [2, 1, 1, 2, 3, 4, 4, 5, 6, 7],\n", + " [3, 2, 2, 1, 2, 3, 4, 5, 6, 7],\n", + " [4, 3, 3, 2, 1, 2, 3, 4, 5, 6],\n", + " [5, 4, 4, 3, 2, 2, 3, 3, 4, 5]]))" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s1 = \"kiten\"\n", + "s2 = \"katerinas\"\n", + "distance_wf, matrix_wf = wagner_fisher(s1, s2)\n", + "s1, s2, f\"{distance_wf = }\", matrix_wf" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -78,6 +109,53 @@ "It's trivial to see that the space complexity can be reduced to $O(min(N, M))$ by only storing the last two rows of the matrix, but we want to keep the entire matrix as a reference to allow debugging and visualization." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To feel safer, while designing our alternative traversal algorithm, let's define an extraction function, that will get the values of a certain skewed diagonal." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def get_skewed_diagonal(matrix: np.ndarray, index: int):\n", + " flipped_matrix = np.fliplr(matrix)\n", + " return np.flip(np.diag(flipped_matrix, k= matrix.shape[1] - index - 1))\n", + "\n", + "# Let's test this function right away.\n", + "matrix = np.array([\n", + " [1, 2, 3],\n", + " [4, 5, 6],\n", + " [7, 8, 9]])\n", + "assert np.all(get_skewed_diagonal(matrix, 2) == [7, 5, 3])\n", + "assert np.all(get_skewed_diagonal(matrix, 1) == [4, 2])\n", + "assert np.all(get_skewed_diagonal(matrix, 4) == [9])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([2, 3, 5, 6, 8])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_skewed_diagonal(matrix_wf, 10)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -95,11 +173,17 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "def square_skewed_diagonals(s1: str, s2: str, verbose: bool = False) -> Tuple[int, np.ndarray]:\n", + "from typing import Optional\n", + "\n", + "def square_skewed_diagonals(\n", + " s1: str, s2: str, \n", + " verbose: bool = False, \n", + " baseline: Optional[np.ndarray] = None) -> Tuple[int, np.ndarray]:\n", + "\n", " assert len(s1) == len(s2), \"First define an algo for square matrices!\"\n", " # Create a matrix of size (len(s1)+1) x (len(s2)+1)\n", " matrix = np.zeros((len(s1) + 1, len(s2) + 1), dtype=int)\n", @@ -115,20 +199,20 @@ " n = len(s1) + 1\n", " \n", " # Number of diagonals and skewed diagonals in the square matrix of size (n x n).\n", - " skew_diagonals_count = 2 * n - 1\n", + " diagonals_count = 2 * n - 1\n", " \n", " # Populate the matrix in 2 separate loops: for the top left triangle and for the bottom right triangle.\n", - " for skew_diagonal_idx in range(2, n):\n", - " skew_diagonal_length = skew_diagonal_idx + 1\n", - " for offset_within_skew_diagonal in range(1, skew_diagonal_length - 1):\n", + " for skew_diagonal_index in range(2, n):\n", + " skew_diagonal_length = skew_diagonal_index + 1\n", + " for offset_within_diagonal in range(1, skew_diagonal_length - 1):\n", " # If we haven't passed the main skew diagonal yet, \n", " # then we have to skip the first and the last operation,\n", " # as those are already pre-populated and form the first column \n", " # and the first row of the Levenshtein matrix respectively.\n", - " i = skew_diagonal_idx - offset_within_skew_diagonal\n", - " j = offset_within_skew_diagonal\n", + " i = skew_diagonal_index - offset_within_diagonal\n", + " j = offset_within_diagonal\n", " if verbose:\n", - " print(f\"top left triangle: {skew_diagonal_idx=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", + " print(f\"top left triangle: {skew_diagonal_index=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", " substitution_cost = s1[i - 1] != s2[j - 1]\n", " matrix[i, j] = min(\n", " matrix[i - 1, j] + 1, #? Deletion cost\n", @@ -136,20 +220,26 @@ " matrix[i - 1, j - 1] + substitution_cost, #? 
Substitution cost\n", " )\n", " \n", + " if baseline is not None:\n", + " assert matrix[i, j] == baseline[i, j], f\"{matrix[i, j]} != {baseline[i, j]} at {i=}, {j=}\"\n", + " \n", " # Now the bottom right triangle of the matrix.\n", - " for skew_diagonal_idx in range(n, skew_diagonals_count):\n", - " skew_diagonal_length = 2*n - skew_diagonal_idx - 1\n", - " for offset_within_skew_diagonal in range(skew_diagonal_length):\n", - " i = n - offset_within_skew_diagonal - 1\n", - " j = skew_diagonal_idx - n + offset_within_skew_diagonal + 1\n", + " for skew_diagonal_index in range(n, diagonals_count):\n", + " skew_diagonal_length = 2 * n - skew_diagonal_index - 1\n", + " for offset_within_diagonal in range(skew_diagonal_length):\n", + " i = n - offset_within_diagonal - 1\n", + " j = skew_diagonal_index - n + offset_within_diagonal + 1\n", " if verbose:\n", - " print(f\"bottom right triangle: {skew_diagonal_idx=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", + " print(f\"bottom right triangle: {skew_diagonal_index=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", " substitution_cost = s1[i - 1] != s2[j - 1]\n", " matrix[i, j] = min(\n", " matrix[i - 1, j] + 1, #? Deletion cost\n", " matrix[i, j - 1] + 1, #? Insertion cost\n", " matrix[i - 1, j - 1] + substitution_cost, #? Substitution cost\n", " )\n", + " \n", + " if baseline is not None:\n", + " assert matrix[i, j] == baseline[i, j], f\"{matrix[i, j]} != {baseline[i, j]} at {i=}, {j=}\"\n", "\n", " # Similarly, the distance will be placed in the bottom right corner of the matrix\n", " return matrix[len(s1), len(s2)], matrix" @@ -164,75 +254,97 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import random\n", "for _ in range(10):\n", - " s1 = ''.join(random.choices(\"ab\", k=50))\n", - " s2 = ''.join(random.choices(\"ab\", k=50))\n", - " d0, _ = wagner_fisher(s1, s2)\n", - " d1, _ = square_skewed_diagonals(s1, s2)\n", - " assert d0 == d1" + " s1 = ''.join(random.choices(\"abc\", k=50))\n", + " s2 = ''.join(random.choices(\"abc\", k=50))\n", + " distance_wf, matrix_wf = wagner_fisher(s1, s2)\n", + " distance_sd, matrix_sd = square_skewed_diagonals(s1, s2, baseline=matrix_wf)\n", + " assert distance_wf == distance_sd, f\"{distance_wf = } != {distance_sd = }\"\n", + " assert np.all(matrix_wf == matrix_sd), f\"{matrix_wf = }\\n{matrix_sd = }\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Going further, we can avoid storing the whole matrix, and only store two diagonals at a time.\n", - "The longer will never exceed N. The shorter one is always at most N-1, and is always shorter by one." + "## Vectorizing the Skewed Diagonals Algorithm\n", + "\n", + "Going further, we can avoid storing the whole matrix, and only store three diagonals at a time.\n", + "The longer will never exceed `n` in length.\n", + "The others are always at most `n-1`.\n", + "Let's try vectorizing different parts of our algorithm, validating it against the output of the naive algorithm for 2 strings: `\"BCDE\"` and `\"FKPU\"`." 
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "('listen',\n", - " 'silent',\n", - " 'distance = np.int64(4)',\n", - " array([[0, 1, 2, 3, 4, 5, 6],\n", - " [1, 1, 2, 2, 3, 4, 5],\n", - " [2, 2, 1, 2, 3, 4, 5],\n", - " [3, 2, 2, 2, 3, 4, 5],\n", - " [4, 3, 3, 3, 3, 4, 4],\n", - " [5, 4, 4, 4, 3, 4, 5],\n", - " [6, 5, 5, 5, 4, 3, 4]]))" + "('BCDE',\n", + " 'FKPU',\n", + " 'distance_wf = np.int64(4)',\n", + " array([[0, 1, 2, 3, 4],\n", + " [1, 1, 2, 3, 4],\n", + " [2, 2, 2, 3, 4],\n", + " [3, 3, 3, 3, 4],\n", + " [4, 4, 4, 4, 4]]))" ] }, - "execution_count": 4, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "s1 = \"listen\"\n", - "s2 = \"silent\"\n", - "# s1 = ''.join(random.choices(\"abcd\", k=100))\n", - "# s2 = ''.join(random.choices(\"abcd\", k=100))\n", - "distance, baseline = wagner_fisher(s1, s2)\n", - "s1, s2, f\"{distance = }\", baseline" + "s1 = \"BCDE\"\n", + "s2 = \"FKPU\"\n", + "distance_wf, matrix_wf = wagner_fisher(s1, s2)\n", + "s1, s2, f\"{distance_wf = }\", matrix_wf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Replacing the letters with numbers and annotating with a header row and column for `\"BCDE\"` and `\"FKPU\"`:\n", + "\n", + "| | | **B** | **C** | **D** | **E** |\n", + "| ----- | --- | ----- | ----- | ----- | ----- |\n", + "| | a | b | c | d | e |\n", + "| **F** | f | g | h | i | j |\n", + "| **K** | k | l | m | n | o |\n", + "| **P** | p | q | r | s | t |\n", + "| **U** | u | v | w | x | y |\n", + "\n", + "At any point we will be working with 3 diagonals:\n", + "\n", + "- `previous` set to `[a]` at start\n", + "- `current` set to `[f, b]` at start\n", + "- `following` set to `[k, g, c]` at start" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(array([0, 0, 0, 0, 0, 0, 0], dtype=uint64),\n", - " array([1, 1, 0, 0, 0, 0, 0], dtype=uint64),\n", - " array([0, 0, 0, 0, 0, 0, 0], dtype=uint64))" + "(array([0, 0, 0, 0, 0], dtype=uint64),\n", + " array([1, 1, 0, 0, 0], dtype=uint64),\n", + " array([0, 0, 0, 0, 0], dtype=uint64))" ] }, - "execution_count": 5, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -242,17 +354,6 @@ "# Number of rows and columns in the square matrix.\n", "n = len(s1) + 1\n", "\n", - "# Let's use just a couple of arrays to store the previous skew diagonals.\n", - "# Let's imagine that our Levenshtein matrix is gonna have 5x5 size for two words of length 4.\n", - "# B C D E << s2 characters: BCDE\n", - "# + ---------\n", - "# | a b c d e\n", - "# F | f g h i j\n", - "# K | k l m n o\n", - "# P | p q r s t\n", - "# U | u v w x y\n", - "# ^\n", - "# ^ s1 characters: FKPU\n", "following = np.zeros(n, dtype=np.uint) # let's assume we are computing the main skew diagonal: [u, q, m, i, e]\n", "current = np.zeros(n, dtype=np.uint) # will contain: [p, l, h, e]\n", "previous = np.zeros(n, dtype=np.uint) # will contain: [k, g, c]\n", @@ -269,71 +370,46 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To feel safer, while designing our alternative traversal algorithm, let's define an extraction function, that will get the values of a certain skewed diagonal." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "def get_skewed_diagonal(matrix: np.ndarray, index: int):\n", - " flipped_matrix = np.fliplr(matrix)\n", - " return np.flip(np.diag(flipped_matrix, k= matrix.shape[1] - index - 1))" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "matrix = np.array([[1, 2, 3],\n", - " [4, 5, 6],\n", - " [7, 8, 9]])\n", - "assert np.all(get_skewed_diagonal(matrix, 2) == [7, 5, 3])\n", - "assert np.all(get_skewed_diagonal(matrix, 1) == [4, 2])\n", - "assert np.all(get_skewed_diagonal(matrix, 4) == [9])" + "Now we can rewrite the first nested loop for the upper triangle of the matrix in NumPy primitives, using it's `np.minimum` function to calculate the minimum of three values." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(array([5, 3, 2, 2, 3, 5, 0], dtype=uint64),\n", - " array([6, 4, 3, 2, 3, 4, 6], dtype=uint64),\n", - " array([6, 4, 3, 2, 3, 4, 6], dtype=uint64))" + "(array([3, 2, 2, 3, 0], dtype=uint64),\n", + " array([4, 3, 2, 3, 4], dtype=uint64),\n", + " array([4, 3, 2, 3, 4], dtype=uint64))" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# To evaluate every subsequent entry:\n", - "following_skew_diagonal_idx = 2\n", - "while following_skew_diagonal_idx < n:\n", - " following_skew_diagonal_length = following_skew_diagonal_idx + 1\n", + "next_diagonal_index = 2\n", + "while next_diagonal_index < n:\n", + " next_skew_diagonal_length = next_diagonal_index + 1\n", "\n", - " old_substitution_costs = previous[:following_skew_diagonal_length - 2]\n", - " added_substitution_costs = [s1[following_skew_diagonal_idx - i - 2] != s2[i] for i in range(following_skew_diagonal_length - 2)]\n", + " old_substitution_costs = previous[:next_skew_diagonal_length - 2]\n", + " added_substitution_costs = [s1[next_diagonal_index - i - 2] != s2[i] for i in range(next_skew_diagonal_length - 2)]\n", " substitution_costs = old_substitution_costs + added_substitution_costs\n", "\n", - " following[1:following_skew_diagonal_length-1] = np.minimum(current[1:following_skew_diagonal_length-1] + 1, current[:following_skew_diagonal_length-2] + 1) # Insertions or deletions\n", - " following[1:following_skew_diagonal_length-1] = np.minimum(following[1:following_skew_diagonal_length-1], substitution_costs) # Substitutions\n", - " following[0] = following_skew_diagonal_idx\n", - " following[following_skew_diagonal_length-1] = following_skew_diagonal_idx\n", - " assert np.all(following[:following_skew_diagonal_length] == get_skewed_diagonal(baseline, following_skew_diagonal_idx))\n", + " following[1:next_skew_diagonal_length - 1] = np.minimum(current[1:next_skew_diagonal_length - 1] + 1, current[:next_skew_diagonal_length - 2] + 1) # Insertions or deletions\n", + " following[1:next_skew_diagonal_length - 1] = np.minimum(following[1:next_skew_diagonal_length - 1], substitution_costs) # Substitutions\n", + " following[0] = next_diagonal_index\n", + " following[next_skew_diagonal_length - 1] = next_diagonal_index\n", + " assert np.all(following[:next_skew_diagonal_length] == get_skewed_diagonal(matrix_wf, next_diagonal_index))\n", " \n", " previous[:] = current[:]\n", " current[:] = following[:]\n", - " following_skew_diagonal_idx += 1\n", + " next_diagonal_index += 1\n", "\n", "previous, current, following # 
Log the state" ] @@ -342,74 +418,107 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a larger diagonal. From now onwards, we will be shrinking. Instead of adding value equal to the skewed diagonal index on either side, we will be cropping those values out." + "By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a larger diagonal.\n", + "From now onwards, we will be shrinking.\n", + "Instead of adding value equal to the skewed diagonal index on either side, we will be cropping those values out." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(array([5, 4, 5, 5, 5, 6, 0], dtype=uint64),\n", - " array([4, 5, 4, 5, 5, 5, 6], dtype=uint64),\n", - " array([4, 5, 4, 5, 5, 5, 6], dtype=uint64))" + "(array([4, 4, 4, 4, 0], dtype=uint64),\n", + " array([4, 4, 4, 4, 4], dtype=uint64),\n", + " array([4, 4, 4, 4, 4], dtype=uint64))" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "while following_skew_diagonal_idx < 2 * n - 1:\n", - " following_skew_diagonal_length = 2 * n - 1 - following_skew_diagonal_idx\n", - " old_substitution_costs = previous[:following_skew_diagonal_length]\n", - " added_substitution_costs = [s1[len(s1) - i - 1] != s2[following_skew_diagonal_idx - n + i] for i in range(following_skew_diagonal_length)]\n", + "while next_diagonal_index < 2 * n - 1:\n", + " next_skew_diagonal_length = 2 * n - 1 - next_diagonal_index\n", + " old_substitution_costs = previous[:next_skew_diagonal_length]\n", + " added_substitution_costs = [s1[len(s1) - i - 1] != s2[next_diagonal_index - n + i] for i in range(next_skew_diagonal_length)]\n", " substitution_costs = old_substitution_costs + added_substitution_costs\n", " \n", - " following[:following_skew_diagonal_length] = np.minimum(current[:following_skew_diagonal_length] + 1, current[1:following_skew_diagonal_length+1] + 1) # Insertions or deletions\n", - " following[:following_skew_diagonal_length] = np.minimum(following[:following_skew_diagonal_length], substitution_costs) # Substitutions\n", - " assert np.all(following[:following_skew_diagonal_length] == get_skewed_diagonal(baseline, following_skew_diagonal_idx)), f\"\\n{following[:following_skew_diagonal_length]} not equal to \\n{get_skewed_diagonal(baseline, following_skew_diagonal_idx)}\"\n", + " following[:next_skew_diagonal_length] = np.minimum(current[:next_skew_diagonal_length] + 1, current[1 : next_skew_diagonal_length + 1] + 1) # Insertions or deletions\n", + " following[:next_skew_diagonal_length] = np.minimum(following[:next_skew_diagonal_length], substitution_costs) # Substitutions\n", + " assert np.all(following[:next_skew_diagonal_length] == get_skewed_diagonal(matrix_wf, next_diagonal_index)), f\"\\n{following[:next_skew_diagonal_length]} not equal to \\n{get_skewed_diagonal(baseline, next_diagonal_index)}\"\n", " \n", - " previous[:following_skew_diagonal_length] = current[1:following_skew_diagonal_length+1]\n", - " current[:following_skew_diagonal_length] = following[:following_skew_diagonal_length]\n", - " following_skew_diagonal_idx += 1\n", + " previous[:next_skew_diagonal_length] = current[1:next_skew_diagonal_length + 1]\n", + " current[:next_skew_diagonal_length] = following[:next_skew_diagonal_length]\n", + " next_diagonal_index += 1\n", "\n", 
"previous, current, following # Log the state" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "assert distance == following[0], f\"{distance = } != {following[0] = }\"" + "assert distance_wf == following[0], f\"{distance_wf = } != {following[0] = }\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Generalizing to Non-Square Matrices" + "## Generalizing to Non-Square Matrices\n", + "\n", + "Let's imaging 2 inputs of length 3 and 5: `\"KPU\"` and `\"BCDEF\"`:\n", + "\n", + "| | | **B** | **C** | **D** | **E** | **F** |\n", + "| ----- | --- | ----- | ----- | ----- | ----- | ----- |\n", + "| | a | b | c | d | e | f |\n", + "| **K** | g | h | i | j | k | l |\n", + "| **P** | m | n | o | p | q | r |\n", + "| **U** | s | t | u | v | w | x |\n", + "\n", + "At any point we will be working with 3 diagonals:\n", + "\n", + "- `previous` set to `[a]` at start\n", + "- `current` set to `[g, b]` at start\n", + "- `next` set to `[m, h, c]` at start\n", + "\n", + "Once we proceed to for X cycles:\n", + "\n", + "- `previous` set to `[s, n, i, d]`\n", + "- `current` set to `[t, o, j, e]`\n", + "- `next` set to `[u, p, k, f]`\n" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "def skewed_diagonals(s1, s2, verbose: bool = False) -> int:\n", - " shorter, longer = (s1, s2) if len(s1) < len(s2) else (s2, s1) \n", + "from typing import Optional\n", + "\n", + "def skewed_diagonals(\n", + " s1: str, s2: str, \n", + " verbose: bool = False, \n", + " baseline: Optional[np.ndarray] = None) -> Tuple[int, np.ndarray]:\n", + " \n", + " shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1) \n", + " baseline = baseline if len(s1) <= len(s2) else baseline.T\n", " shorter_dim = len(shorter) + 1\n", " longer_dim = len(longer) + 1\n", - " # Create a matrix of size (len(s1)+1) x (len(s2)+1)\n", - " matrix = np.zeros((len(shorter) + 1, len(longer) + 1), dtype=int)\n", - " matrix[:, :] = 99\n", + " if verbose:\n", + " print(f\"{shorter=}, {longer=}, {shorter_dim=}, {longer_dim=}\")\n", + " \n", + " # Create a matrix of size (shorter_dim) x (longer_dim)\n", + " matrix = np.zeros((shorter_dim, longer_dim), dtype=int)\n", + " matrix[:, :] = longer_dim + 1 # or +inf \n", "\n", " # Initialize the first column and first row of the matrix\n", " for i in range(shorter_dim):\n", @@ -417,111 +526,110 @@ " for j in range(longer_dim):\n", " matrix[0, j] = j\n", "\n", - " # Let's say we are dealing with 6 and 9 letter words.\n", - " # The matrix will have size 7 x 10, parameterized as (shorter_dim x longer_dim).\n", + " # Let's say we are dealing with 3 and 5 letter words.\n", + " # The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim).\n", " # It will have:\n", - " # - 8 diagonals of increasing length, at positions: 0, 1, 2, 3, 4, 5, 6, 7.\n", - " # - 2 diagonals of fixed length, at positions: 8, 9.\n", - " # - 8 diagonals of decreasing length, at positions: 10, 11, 12, 13, 14, 15, 16, 17.\n", - " skew_diagonals_count = 2 * longer_dim - 1\n", + " # - 4 diagonals of increasing length, at positions: 0, 1, 2, 3.\n", + " # - 2 diagonals of fixed length, at positions: 4, 5.\n", + " # - 3 diagonals of decreasing length, at positions: 6, 7, 8.\n", + " diagonals_count = shorter_dim + longer_dim - 1\n", "\n", " # Same as with square matrices, the 0th diagonal contains - just one element - zero - skipping it.\n", " # Same as with square matrices, the 1st 
diagonal contains the values 1 and 1 - skipping it.\n", " # Now let's handle the rest of the upper triangle.\n", - " for skew_diagonal_idx in range(2, shorter_dim + 1):\n", - " skew_diagonal_length = (skew_diagonal_idx + 1)\n", - " for offset_within_skew_diagonal in range(1, skew_diagonal_length-1): #! Skip the first column & row\n", + " for skew_diagonal_index in range(2, shorter_dim):\n", + " skew_diagonal_length = (skew_diagonal_index + 1)\n", + " for offset_within_diagonal in range(1, skew_diagonal_length - 1): #! Skip the first column & row\n", " # If we haven't passed the main skew diagonal yet, \n", " # then we have to skip the first and the last operation,\n", " # as those are already pre-populated and form the first column \n", " # and the first row of the Levenshtein matrix respectively.\n", - " i = skew_diagonal_idx - offset_within_skew_diagonal\n", - " j = offset_within_skew_diagonal\n", + " i = skew_diagonal_index - offset_within_diagonal\n", + " j = offset_within_diagonal\n", " if verbose:\n", - " print(f\"top left triangle: {skew_diagonal_idx=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", + " print(f\"top left triangle: {skew_diagonal_index=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", " shorter_char = shorter[i - 1]\n", " longer_char = longer[j - 1]\n", " substitution_cost = shorter_char != longer_char\n", " matrix[i, j] = min(\n", - " matrix[i - 1, j] + 1, # Deletion\n", - " matrix[i, j - 1] + 1, # Insertion\n", - " matrix[i - 1, j - 1] + substitution_cost, # Substitution\n", + " matrix[i - 1, j] + 1, #? Deletion cost\n", + " matrix[i, j - 1] + 1, #? Insertion cost\n", + " matrix[i - 1, j - 1] + substitution_cost, #? Substitution cost\n", " )\n", " \n", + " if baseline is not None:\n", + " assert matrix[i, j] == baseline[i, j], f\"{matrix[i, j]} != {baseline[i, j]} at {i=}, {j=}\"\n", + " \n", " # Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles. \n", - " for skew_diagonal_idx in range(shorter_dim + 1, longer_dim + 1):\n", + " for skew_diagonal_index in range(shorter_dim, longer_dim):\n", " skew_diagonal_length = shorter_dim\n", - " for offset_within_skew_diagonal in range(skew_diagonal_length):\n", - " i = shorter_dim - offset_within_skew_diagonal - 1\n", - " j = offset_within_skew_diagonal + 1\n", + " for offset_within_diagonal in range(skew_diagonal_length - 1): #! Skip the first row\n", + " i = shorter_dim - offset_within_diagonal - 1\n", + " j = skew_diagonal_index - shorter_dim + offset_within_diagonal + 1\n", " if verbose:\n", - " print(f\"anti-band: {skew_diagonal_idx=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", + " print(f\"anti-band: {skew_diagonal_index=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", " shorter_char = shorter[i - 1]\n", " longer_char = longer[j - 1]\n", " substitution_cost = shorter_char != longer_char\n", " matrix[i, j] = min(\n", - " matrix[i - 1, j] + 1, # Deletion\n", - " matrix[i, j - 1] + 1, # Insertion\n", - " matrix[i - 1, j - 1] + substitution_cost, # Substitution\n", + " matrix[i - 1, j] + 1, #? Deletion cost\n", + " matrix[i, j - 1] + 1, #? Insertion cost\n", + " matrix[i - 1, j - 1] + substitution_cost, #? 
Substitution cost\n", " )\n", + " \n", + " if baseline is not None:\n", + " assert matrix[i, j] == baseline[i, j], f\"{matrix[i, j]} != {baseline[i, j]} at {i=}, {j=}\"\n", " \n", " # Now let's handle the bottom right triangle.\n", - " for skew_diagonal_idx in range(longer_dim + 1, skew_diagonals_count):\n", - " skew_diagonal_length = 2 * longer_dim - skew_diagonal_idx - 1\n", - " for offset_within_skew_diagonal in range(skew_diagonal_length):\n", - " i = shorter_dim - offset_within_skew_diagonal - 1\n", - " j = skew_diagonal_idx - longer_dim + offset_within_skew_diagonal + 1\n", + " for skew_diagonal_index in range(longer_dim, diagonals_count):\n", + " skew_diagonal_length = diagonals_count - skew_diagonal_index\n", + " for offset_within_diagonal in range(skew_diagonal_length):\n", + " i = shorter_dim - offset_within_diagonal - 1\n", + " j = skew_diagonal_index - shorter_dim + offset_within_diagonal + 1\n", " if verbose:\n", - " print(f\"bottom right triangle: {skew_diagonal_idx=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", + " print(f\"bottom right triangle: {skew_diagonal_index=}, {skew_diagonal_length=}, {i=}, {j=}\")\n", + " assert (i - 1) >= 0 and (i - 1) < len(shorter), f\"{i = }\"\n", + " assert (j - 1) >= 0 and (j - 1) < len(longer), f\"{j = }\"\n", " shorter_char = shorter[i - 1]\n", " longer_char = longer[j - 1]\n", " substitution_cost = shorter_char != longer_char\n", " matrix[i, j] = min(\n", - " matrix[i - 1, j] + 1, # Deletion\n", - " matrix[i, j - 1] + 1, # Insertion\n", - " matrix[i - 1, j - 1] + substitution_cost, # Substitution\n", + " matrix[i - 1, j] + 1, #? Deletion cost\n", + " matrix[i, j - 1] + 1, #? Insertion cost\n", + " matrix[i - 1, j - 1] + substitution_cost, #? Substitution cost\n", " )\n", + " \n", + " if baseline is not None:\n", + " assert matrix[i, j] == baseline[i, j], f\"{matrix[i, j]} != {baseline[i, j]} at {i=}, {j=}\"\n", "\n", " # Return the Levenshtein distance\n", - " return matrix[len(shorter), len(longer)], matrix" + " distance = matrix[len(shorter), len(longer)]\n", + " if len(s1) > len(s2):\n", + " matrix = matrix.T\n", + " return distance, matrix" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('listeners',\n", - " 'silents',\n", - " 'distance = np.int64(5)',\n", - " array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],\n", - " [1, 1, 2, 2, 3, 4, 5, 6, 7, 8],\n", - " [2, 2, 1, 2, 3, 4, 5, 6, 7, 8],\n", - " [3, 2, 2, 2, 3, 4, 5, 6, 7, 8],\n", - " [4, 3, 3, 3, 3, 3, 4, 5, 6, 7],\n", - " [5, 4, 4, 4, 4, 4, 3, 4, 5, 6],\n", - " [6, 5, 5, 5, 4, 5, 4, 4, 5, 6],\n", - " [7, 6, 6, 5, 5, 5, 5, 5, 5, 5]]))" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "s1 = \"listeners\"\n", - "s2 = \"silents\"\n", - "distance, baseline = skewed_diagonals(s1, s2)\n", - "s1, s2, f\"{distance = }\", baseline" + "import random\n", + "for _ in range(100):\n", + " len1 = random.randint(1, 50)\n", + " len2 = random.randint(1, 50)\n", + " s1 = ''.join(random.choices(\"abc\", k=len1))\n", + " s2 = ''.join(random.choices(\"abc\", k=len2))\n", + " distance_wf, matrix_wf = wagner_fisher(s1, s2)\n", + " distance_sd, matrix_sd = skewed_diagonals(s1, s2, baseline=matrix_wf, verbose=False)\n", + " assert distance_wf == distance_sd, f\"{distance_wf = } != {distance_sd = }\"\n", + " assert np.all(matrix_wf == matrix_sd), f\"{matrix_wf = }\\n{matrix_sd = }\"" ] }, { "cell_type": "code", - "execution_count": 13, + 
"execution_count": 14, "metadata": {}, "outputs": [ { @@ -529,7 +637,7 @@ "text/plain": [ "('listeners',\n", " 'silents',\n", - " 'distance = np.int64(5)',\n", + " 'distance_sd = np.int64(5)',\n", " array([[0, 1, 2, 3, 4, 5, 6, 7],\n", " [1, 1, 2, 2, 3, 4, 5, 6],\n", " [2, 2, 1, 2, 3, 4, 5, 6],\n", @@ -542,36 +650,238 @@ " [9, 8, 8, 8, 7, 6, 6, 5]]))" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "distance, baseline = wagner_fisher(s1, s2)\n", - "s1, s2, f\"{distance = }\", baseline" + "s1 = \"listeners\"\n", + "s2 = \"silents\"\n", + "distance_wf, matrix_wf = wagner_fisher(s1, s2)\n", + "distance_sd, matrix_sd = skewed_diagonals(s1, s2, baseline=matrix_wf)\n", + "s1, s2, f\"{distance_sd = }\", matrix_sd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bounding the Error\n", + "\n", + "It's easy to spot that the algorithm can be further optimized if we are dealing with \"bounded\" edit distances, where the maximum allowed number of edits is known in advance.\n", + "In such cases, we only need to evaluate a band around the main diagonal, and can skip the rest of the matrix.\n", + "For the bound $k$, we only need to evaluate $2k+1$ diagonals." ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "s1 = ''.join(random.choices(\"abcd\", k=5))\n", - "s2 = ''.join(random.choices(\"abcd\", k=6))\n", - "distance_v0, baseline_v0 = wagner_fisher(s1, s2)\n", - "distance_v2, baseline_v2 = skewed_diagonals(s1, s2, verbose=False)\n", - "assert distance_v0 == distance_v2, f\"{distance_v0 = } != {distance_v2 = }\"\n", - "assert np.all(baseline_v0 == baseline_v2), f\"{baseline_v0 = }\\n{baseline_v2 = }\"" + "from typing import Optional\n", + "\n", + "\n", + "def bounded_skewed_diagonals(\n", + " s1: str,\n", + " s2: str,\n", + " verbose: bool = False,\n", + " bound: Optional[int] = None,\n", + " baseline: Optional[np.ndarray] = None,\n", + ") -> Tuple[int, np.ndarray]:\n", + "\n", + " shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)\n", + " baseline = baseline if len(s1) <= len(s2) else baseline.T\n", + " shorter_dim = len(shorter) + 1\n", + " longer_dim = len(longer) + 1\n", + " if verbose:\n", + " print(f\"{shorter=}, {longer=}, {shorter_dim=}, {longer_dim=}\")\n", + "\n", + " # Create a matrix of size (shorter_dim) x (longer_dim)\n", + " matrix = np.zeros((shorter_dim, longer_dim), dtype=int)\n", + " matrix[:, :] = np.iinfo(matrix.dtype).max\n", + "\n", + " # Initialize the first column and first row of the matrix\n", + " for i in range(shorter_dim):\n", + " matrix[i, 0] = i\n", + " for j in range(longer_dim):\n", + " matrix[0, j] = j\n", + "\n", + " # Let's say we are dealing with 3 and 5 letter words.\n", + " # The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim).\n", + " # It will have:\n", + " # - 4 diagonals of increasing length, at positions: 0, 1, 2, 3.\n", + " # - 2 diagonals of fixed length, at positions: 4, 5.\n", + " # - 3 diagonals of decreasing length, at positions: 6, 7, 8.\n", + " diagonals_count = shorter_dim + longer_dim - 1\n", + "\n", + " # Same as with square matrices, the 0th diagonal contains - just one element - zero - skipping it.\n", + " # Same as with square matrices, the 1st diagonal contains the values 1 and 1 - skipping it.\n", + " # In unbounded case, we the upper triangle will have `shorter_dim` rows and columns.\n", + " # In bounded case, we will have `min(bound, 
shorter_dim)` rows and columns.\n", + " upper_triangle_dim = min(bound, shorter_dim) if bound is not None else shorter_dim\n", + " for skew_diagonal_index in range(2, upper_triangle_dim):\n", + " skew_diagonal_length = skew_diagonal_index + 1\n", + " for offset_within_diagonal in range(\n", + " 1, skew_diagonal_length - 1\n", + " ): #! Skip the first column & row\n", + " # If we haven't passed the main skew diagonal yet,\n", + " # then we have to skip the first and the last operation,\n", + " # as those are already pre-populated and form the first column\n", + " # and the first row of the Levenshtein matrix respectively.\n", + " i = skew_diagonal_index - offset_within_diagonal\n", + " j = offset_within_diagonal\n", + " if verbose:\n", + " print(\n", + " f\"top left triangle: {skew_diagonal_index=}, {skew_diagonal_length=}, {i=}, {j=}\"\n", + " )\n", + " shorter_char = shorter[i - 1]\n", + " longer_char = longer[j - 1]\n", + " substitution_cost = shorter_char != longer_char\n", + " matrix[i, j] = min(\n", + " matrix[i - 1, j] + 1, # ? Deletion cost\n", + " matrix[i, j - 1] + 1, # ? Insertion cost\n", + " matrix[i - 1, j - 1] + substitution_cost, # ? Substitution cost\n", + " )\n", + "\n", + " # Validation checks:\n", + " if baseline is not None:\n", + " assert (\n", + " matrix[i, j] == baseline[i, j]\n", + " ), f\"{matrix[i, j]} != {baseline[i, j]} at {i=}, {j=}\"\n", + "\n", + " # Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles.\n", + " # In the unbounded case, we will enumerate diagonal indices from `shorter_dim` to `longer_dim`.\n", + " # In the bounded case, we go through the same \n", + " for skew_diagonal_index in range(shorter_dim, longer_dim):\n", + " skew_diagonal_length = shorter_dim\n", + " for offset_within_diagonal in range(\n", + " skew_diagonal_length - 1\n", + " ): #! Skip the first row\n", + " i = shorter_dim - offset_within_diagonal - 1\n", + " j = skew_diagonal_index - shorter_dim + offset_within_diagonal + 1\n", + " if verbose:\n", + " print(\n", + " f\"anti-band: {skew_diagonal_index=}, {skew_diagonal_length=}, {i=}, {j=}\"\n", + " )\n", + " shorter_char = shorter[i - 1]\n", + " longer_char = longer[j - 1]\n", + " substitution_cost = shorter_char != longer_char\n", + " matrix[i, j] = min(\n", + " matrix[i - 1, j] + 1, # ? Deletion cost\n", + " matrix[i, j - 1] + 1, # ? Insertion cost\n", + " matrix[i - 1, j - 1] + substitution_cost, # ? Substitution cost\n", + " )\n", + "\n", + " if baseline is not None:\n", + " assert (\n", + " matrix[i, j] == baseline[i, j]\n", + " ), f\"{matrix[i, j]} != {baseline[i, j]} at {i=}, {j=}\"\n", + "\n", + " # Now let's handle the bottom right triangle.\n", + " for skew_diagonal_index in range(longer_dim, diagonals_count):\n", + " skew_diagonal_length = diagonals_count - skew_diagonal_index\n", + " for offset_within_diagonal in range(skew_diagonal_length):\n", + " i = shorter_dim - offset_within_diagonal - 1\n", + " j = skew_diagonal_index - shorter_dim + offset_within_diagonal + 1\n", + " if verbose:\n", + " print(\n", + " f\"bottom right triangle: {skew_diagonal_index=}, {skew_diagonal_length=}, {i=}, {j=}\"\n", + " )\n", + " assert (i - 1) >= 0 and (i - 1) < len(shorter), f\"{i = }\"\n", + " assert (j - 1) >= 0 and (j - 1) < len(longer), f\"{j = }\"\n", + " shorter_char = shorter[i - 1]\n", + " longer_char = longer[j - 1]\n", + " substitution_cost = shorter_char != longer_char\n", + " matrix[i, j] = min(\n", + " matrix[i - 1, j] + 1, # ? Deletion cost\n", + " matrix[i, j - 1] + 1, # ? 
Insertion cost\n", + " matrix[i - 1, j - 1] + substitution_cost, # ? Substitution cost\n", + " )\n", + "\n", + " if baseline is not None:\n", + " assert (\n", + " matrix[i, j] == baseline[i, j]\n", + " ), f\"{matrix[i, j]} != {baseline[i, j]} at {i=}, {j=}\"\n", + "\n", + " # Return the Levenshtein distance\n", + " distance = matrix[len(shorter), len(longer)]\n", + " if len(s1) > len(s2):\n", + " matrix = matrix.T\n", + " return distance, matrix" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Putting Everything Together" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "def vectorized_skewed_diagonals(\n", + " s1: str, s2: str, \n", + " verbose: bool = False, \n", + " baseline: Optional[np.ndarray] = None) -> Tuple[int, np.ndarray]:\n", + " \n", + " shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1) \n", + " baseline = baseline if len(s1) <= len(s2) else baseline.T\n", + " shorter_dim = len(shorter) + 1\n", + " longer_dim = len(longer) + 1\n", + " if verbose:\n", + " print(f\"{shorter=}, {longer=}, {shorter_dim=}, {longer_dim=}\")\n", + " \n", + " # Create a matrix of size (shorter_dim) x (longer_dim)\n", + " matrix = np.zeros((shorter_dim, longer_dim), dtype=int)\n", + " matrix[:, :] = longer_dim + 1 # or +inf \n", + "\n", + " # Initialize the first column and first row of the matrix\n", + " for i in range(shorter_dim):\n", + " matrix[i, 0] = i\n", + " for j in range(longer_dim):\n", + " matrix[0, j] = j\n", + "\n", + " # Let's say we are dealing with 3 and 5 letter words.\n", + " # The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim).\n", + " # It will have:\n", + " # - 4 diagonals of increasing length, at positions: 0, 1, 2, 3.\n", + " # - 2 diagonals of fixed length, at positions: 4, 5.\n", + " # - 3 diagonals of decreasing length, at positions: 6, 7, 8.\n", + " diagonals_count = shorter_dim + longer_dim - 1\n", + "\n", + " # Same as with square matrices, the 0th diagonal contains - just one element - zero - skipping it.\n", + " # Same as with square matrices, the 1st diagonal contains the values 1 and 1 - skipping it.\n", + " # Now let's handle the rest of the upper triangle.\n", + " next_diagonal_index = 2\n", + " while next_diagonal_index < shorter_dim:\n", + " next_skew_diagonal_length = next_diagonal_index + 1\n", + "\n", + " old_substitution_costs = previous[:next_skew_diagonal_length - 2]\n", + " added_substitution_costs = [shorter[next_diagonal_index - offset_within_diagonal - 2] != longer[offset_within_diagonal] for offset_within_diagonal in range(next_skew_diagonal_length - 2)]\n", + " substitution_costs = old_substitution_costs + added_substitution_costs\n", + "\n", + " following[1:next_skew_diagonal_length - 1] = np.minimum(current[1:next_skew_diagonal_length - 1] + 1, current[:next_skew_diagonal_length - 2] + 1) # Insertions or deletions\n", + " following[1:next_skew_diagonal_length - 1] = np.minimum(following[1:next_skew_diagonal_length - 1], substitution_costs) # Substitutions\n", + " following[0] = next_diagonal_index\n", + " following[next_skew_diagonal_length - 1] = next_diagonal_index\n", + " assert np.all(following[:next_skew_diagonal_length] == get_skewed_diagonal(baseline, next_diagonal_index))\n", + " \n", + " previous[:] = current[:]\n", + " current[:] = following[:]\n", + " next_diagonal_index += 1\n", + " \n", + " # Now let's handle the anti-diagonal band of the matrix, between the top and bottom 
triangles. \n", + " while next_diagonal_index < longer_dim:\n", + " next_skew_diagonal_length = shorter_dim\n", + " \n", + " ..." + ] } ], "metadata": { From d0678f87cfb00a4268e584aaf07c341f2f7f7c50 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:13:48 +0000 Subject: [PATCH 009/751] Fix: Wrong env. variable names --- CONTRIBUTING.md | 2 +- c/lib.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index da369582..524d6c49 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -116,7 +116,7 @@ Replacing the default compiler is not recommended, as it may break the system, b ```bash brew install llvm -cmake -D CMAKE_BUILD_TYPE=Release -D SIMSIMD_BUILD_TESTS=1 \ +cmake -D CMAKE_BUILD_TYPE=Release -D STRINGZILLA_BUILD_TEST=1 \ -D CMAKE_C_COMPILER="$(brew --prefix llvm)/bin/clang" \ -D CMAKE_CXX_COMPILER="$(brew --prefix llvm)/bin/clang++" \ -B build_release diff --git a/c/lib.c b/c/lib.c index ee48400e..19d22ba5 100644 --- a/c/lib.c +++ b/c/lib.c @@ -92,7 +92,7 @@ SZ_DYNAMIC sz_capability_t sz_capabilities(void) { (sz_cap_x86_gfni_k * (supports_gfni)) | // (sz_cap_serial_k)); -#endif // SIMSIMD_TARGET_X86 +#endif // SZ_TARGET_X86 #if SZ_USE_ARM_NEON || SZ_USE_ARM_SVE @@ -107,7 +107,7 @@ SZ_DYNAMIC sz_capability_t sz_capabilities(void) { (sz_cap_arm_neon_k * supports_neon) | // (sz_cap_serial_k)); -#endif // SIMSIMD_TARGET_ARM +#endif // SZ_TARGET_ARM return sz_cap_serial_k; } From ecb377541d0c706cf8997faff4f026b07e3f76f3 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:18:34 +0000 Subject: [PATCH 010/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/types.h --- include/stringzilla/{stringzilla.h => types.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/{stringzilla.h => types.h} (100%) diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/types.h similarity index 100% rename from include/stringzilla/stringzilla.h rename to include/stringzilla/types.h From 22e3d1e34d62d68c1e89df7c8bdc201faa18a9de Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:18:34 +0000 Subject: [PATCH 011/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/types.h --- include/stringzilla/stringzilla.h => temp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/stringzilla.h => temp (100%) diff --git a/include/stringzilla/stringzilla.h b/temp similarity index 100% rename from include/stringzilla/stringzilla.h rename to temp From 8cb0742b2d1b31b61fac5272f17017953c6677e6 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:18:34 +0000 Subject: [PATCH 012/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/types.h --- temp => include/stringzilla/stringzilla.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename temp => include/stringzilla/stringzilla.h (100%) diff --git a/temp b/include/stringzilla/stringzilla.h similarity index 100% rename from temp rename to include/stringzilla/stringzilla.h From 9e577be71dcd2e20854bf55f08c54854b3e82989 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:19:11 +0000 Subject: [PATCH 013/751] Make: Split ./include/stringzilla/stringzilla.h to 
./include/stringzilla/find.h --- include/stringzilla/{stringzilla.h => find.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/{stringzilla.h => find.h} (100%) diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/find.h similarity index 100% rename from include/stringzilla/stringzilla.h rename to include/stringzilla/find.h From 14ba3bf3c43408438a7de9ad57118c747c1347b1 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:19:11 +0000 Subject: [PATCH 014/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/find.h --- include/stringzilla/stringzilla.h => temp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/stringzilla.h => temp (100%) diff --git a/include/stringzilla/stringzilla.h b/temp similarity index 100% rename from include/stringzilla/stringzilla.h rename to temp From 974ed78822dc0b519dd61bc1c4dc18d59fe4ad15 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:19:11 +0000 Subject: [PATCH 015/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/find.h --- temp => include/stringzilla/stringzilla.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename temp => include/stringzilla/stringzilla.h (100%) diff --git a/temp b/include/stringzilla/stringzilla.h similarity index 100% rename from temp rename to include/stringzilla/stringzilla.h From 9e9f2567d052d635722921a1d70ec63d69ec6669 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:19:29 +0000 Subject: [PATCH 016/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/hash.h --- include/stringzilla/{stringzilla.h => hash.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/{stringzilla.h => hash.h} (100%) diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/hash.h similarity index 100% rename from include/stringzilla/stringzilla.h rename to include/stringzilla/hash.h From 08d0a20d35d3b29a44b9c8a826d53435c3ef839c Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:19:29 +0000 Subject: [PATCH 017/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/hash.h --- include/stringzilla/stringzilla.h => temp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/stringzilla.h => temp (100%) diff --git a/include/stringzilla/stringzilla.h b/temp similarity index 100% rename from include/stringzilla/stringzilla.h rename to temp From 1f60e6d7c81f0e285e594eb63fee6119e05a3e69 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:19:29 +0000 Subject: [PATCH 018/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/hash.h --- temp => include/stringzilla/stringzilla.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename temp => include/stringzilla/stringzilla.h (100%) diff --git a/temp b/include/stringzilla/stringzilla.h similarity index 100% rename from temp rename to include/stringzilla/stringzilla.h From d74e5dca2e62eb0078cb2ebacc0dac2b8bb92d54 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:20:12 +0000 Subject: [PATCH 019/751] Make: Split ./include/stringzilla/stringzilla.h to 
./include/stringzilla/similarity.h --- include/stringzilla/{stringzilla.h => similarity.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/{stringzilla.h => similarity.h} (100%) diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/similarity.h similarity index 100% rename from include/stringzilla/stringzilla.h rename to include/stringzilla/similarity.h From 10d829efcb8ed4cfa5f2db4050f8403184484423 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:20:12 +0000 Subject: [PATCH 020/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/similarity.h --- include/stringzilla/stringzilla.h => temp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/stringzilla.h => temp (100%) diff --git a/include/stringzilla/stringzilla.h b/temp similarity index 100% rename from include/stringzilla/stringzilla.h rename to temp From e23c35ff2c2d4ccb752f4ffbf9b6f39a1677b532 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:20:12 +0000 Subject: [PATCH 021/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/similarity.h --- temp => include/stringzilla/stringzilla.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename temp => include/stringzilla/stringzilla.h (100%) diff --git a/temp b/include/stringzilla/stringzilla.h similarity index 100% rename from temp rename to include/stringzilla/stringzilla.h From 3f9c248fbf59add2246055462e8fc19dc9f1693b Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:21:30 +0000 Subject: [PATCH 022/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/small_string.h --- include/stringzilla/{stringzilla.h => small_string.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/{stringzilla.h => small_string.h} (100%) diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/small_string.h similarity index 100% rename from include/stringzilla/stringzilla.h rename to include/stringzilla/small_string.h From 89c46810c2f9bfafa31f8592339f9a1b45dcc245 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:21:30 +0000 Subject: [PATCH 023/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/small_string.h --- include/stringzilla/stringzilla.h => temp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/stringzilla.h => temp (100%) diff --git a/include/stringzilla/stringzilla.h b/temp similarity index 100% rename from include/stringzilla/stringzilla.h rename to temp From 3464cb428ae9a8721ab82a8c4bff214aa9ce6254 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:21:30 +0000 Subject: [PATCH 024/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/small_string.h --- temp => include/stringzilla/stringzilla.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename temp => include/stringzilla/stringzilla.h (100%) diff --git a/temp b/include/stringzilla/stringzilla.h similarity index 100% rename from temp rename to include/stringzilla/stringzilla.h From 085d2d3c8b99e0f90d320dd027040e554e410929 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:22:12 
+0000 Subject: [PATCH 025/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/sort.h --- include/stringzilla/{stringzilla.h => sort.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/{stringzilla.h => sort.h} (100%) diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/sort.h similarity index 100% rename from include/stringzilla/stringzilla.h rename to include/stringzilla/sort.h From cbfe5c7ac6371047eae88621b092297474d0b82a Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:22:12 +0000 Subject: [PATCH 026/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/sort.h --- include/stringzilla/stringzilla.h => temp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/stringzilla.h => temp (100%) diff --git a/include/stringzilla/stringzilla.h b/temp similarity index 100% rename from include/stringzilla/stringzilla.h rename to temp From c357c3ea756523d3bcc8d8f25068ad08aef5456d Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 11:22:12 +0000 Subject: [PATCH 027/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/sort.h --- temp => include/stringzilla/stringzilla.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename temp => include/stringzilla/stringzilla.h (100%) diff --git a/temp b/include/stringzilla/stringzilla.h similarity index 100% rename from temp rename to include/stringzilla/stringzilla.h From 66778d6b2b3aa0eed27e32fbdceef79b8c54eda5 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 14:14:26 +0000 Subject: [PATCH 028/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/memory.h --- include/stringzilla/{stringzilla.h => memory.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/{stringzilla.h => memory.h} (100%) diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/memory.h similarity index 100% rename from include/stringzilla/stringzilla.h rename to include/stringzilla/memory.h From 45e57eefd796841cbd14ee7f75ec42b42b5bde0c Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 14:14:26 +0000 Subject: [PATCH 029/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/memory.h --- include/stringzilla/stringzilla.h => temp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/stringzilla.h => temp (100%) diff --git a/include/stringzilla/stringzilla.h b/temp similarity index 100% rename from include/stringzilla/stringzilla.h rename to temp From 2f7652141bd8dc3c2c38ab34321567bfcdb91d93 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 14:14:27 +0000 Subject: [PATCH 030/751] Make: Split ./include/stringzilla/stringzilla.h to ./include/stringzilla/memory.h --- temp => include/stringzilla/stringzilla.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename temp => include/stringzilla/stringzilla.h (100%) diff --git a/temp b/include/stringzilla/stringzilla.h similarity index 100% rename from temp rename to include/stringzilla/stringzilla.h From 2a1fcd113d217e3124f6501c38e93a318aca37f0 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 14:48:51 
+0000 Subject: [PATCH 031/751] Fix: Filter `find.h` file --- include/stringzilla/find.h | 6944 +++++------------------------------- 1 file changed, 797 insertions(+), 6147 deletions(-) diff --git a/include/stringzilla/find.h b/include/stringzilla/find.h index de7fbcac..a51bd4c6 100644 --- a/include/stringzilla/find.h +++ b/include/stringzilla/find.h @@ -1,724 +1,32 @@ /** - * @brief StringZilla is a collection of advanced string algorithms, designed to be used in Big Data applications. - * It is generally faster than LibC, and has a broader & cleaner interface, and targets modern x86 CPUs - * with AVX-512 and Arm NEON and older CPUs with SWAR and auto-vectorization. - * - * Consider overriding the following macros to customize the library: - * - * - `SZ_DEBUG=0` - whether to enable debug assertions and logging. - * - `SZ_DYNAMIC_DISPATCH=0` - whether to use runtime dispatching of the most advanced SIMD backend. - * - `SZ_USE_MISALIGNED_LOADS=0` - whether to use misaligned loads on platforms that support them. - * - `SZ_SWAR_THRESHOLD=24` - threshold for switching to SWAR backend over serial byte-level for-loops. - * - `SZ_USE_X86_AVX512=?` - whether to use AVX-512 instructions on x86_64. - * - `SZ_USE_X86_AVX2=?` - whether to use AVX2 instructions on x86_64. - * - `SZ_USE_ARM_NEON=?` - whether to use NEON instructions on ARM. - * - `SZ_USE_ARM_SVE=?` - whether to use SVE instructions on ARM. - * - * @see StringZilla: https://github.com/ashvardanian/StringZilla/blob/main/README.md - * @see LibC String: https://pubs.opengroup.org/onlinepubs/009695399/basedefs/string.h.html - * - * @file stringzilla.h + * @brief Hardware-accelerated sub-string and character-set search utilities. + * @file find.h * @author Ash Vardanian - */ -#ifndef STRINGZILLA_H_ -#define STRINGZILLA_H_ - -#define STRINGZILLA_VERSION_MAJOR 3 -#define STRINGZILLA_VERSION_MINOR 11 -#define STRINGZILLA_VERSION_PATCH 0 - -/** - * @brief When set to 1, the library will include the following LibC headers: and . - * In debug builds (SZ_DEBUG=1), the library will also include and . * - * You may want to disable this compiling for use in the kernel, or in embedded systems. - * You may also avoid them, if you are very sensitive to compilation time and avoid pre-compiled headers. - * https://artificial-mind.net/projects/compile-health/ - */ -#ifndef SZ_AVOID_LIBC -#define SZ_AVOID_LIBC (0) // true or false -#endif - -/** - * @brief A misaligned load can be - trying to fetch eight consecutive bytes from an address - * that is not divisible by eight. On x86 enabled by default. On ARM it's not. + * Includes core APIs: * - * Most platforms support it, but there is no industry standard way to check for those. - * This value will mostly affect the performance of the serial (SWAR) backend. - */ -#ifndef SZ_USE_MISALIGNED_LOADS -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) -#define SZ_USE_MISALIGNED_LOADS (1) // true or false -#else -#define SZ_USE_MISALIGNED_LOADS (0) // true or false -#endif -#endif - -/** - * @brief Removes compile-time dispatching, and replaces it with runtime dispatching. - * So the `sz_find` function will invoke the most advanced backend supported by the CPU, - * that runs the program, rather than the most advanced backend supported by the CPU - * used to compile the library or the downstream application. 
- */ -#ifndef SZ_DYNAMIC_DISPATCH -#define SZ_DYNAMIC_DISPATCH (0) // true or false -#endif - -/** - * @brief Analogous to `size_t` and `std::size_t`, unsigned integer, identical to pointer size. - * 64-bit on most platforms where pointers are 64-bit. - * 32-bit on platforms where pointers are 32-bit. - */ -#if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64) -#define SZ_DETECT_64_BIT (1) -#define SZ_SIZE_MAX (0xFFFFFFFFFFFFFFFFull) // Largest unsigned integer that fits into 64 bits. -#define SZ_SSIZE_MAX (0x7FFFFFFFFFFFFFFFull) // Largest signed integer that fits into 64 bits. -#else -#define SZ_DETECT_64_BIT (0) -#define SZ_SIZE_MAX (0xFFFFFFFFu) // Largest unsigned integer that fits into 32 bits. -#define SZ_SSIZE_MAX (0x7FFFFFFFu) // Largest signed integer that fits into 32 bits. -#endif - -/** - * @brief On Big-Endian machines StringZilla will work in compatibility mode. - * This disables SWAR hacks to minimize code duplication, assuming practically - * all modern popular platforms are Little-Endian. + * - `sz_equal` + * - `sz_find` and reverse-order `sz_rfind` + * - `sz_find_byte` and reverse-order `sz_rfind_byte` + * - `sz_find_charset` and reverse-order `sz_rfind_charset` * - * This variable is hard to infer from macros reliably. It's best to set it manually. - * For that CMake provides the `TestBigEndian` and `CMAKE__BYTE_ORDER` (from 3.20 onwards). - * In Python one can check `sys.byteorder == 'big'` in the `setup.py` script and pass the appropriate macro. - * https://stackoverflow.com/a/27054190 - */ -#ifndef SZ_DETECT_BIG_ENDIAN -#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \ - defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || defined(__MIBSEB) || defined(__MIBSEB__) -#define SZ_DETECT_BIG_ENDIAN (1) //< It's a big-endian target architecture -#else -#define SZ_DETECT_BIG_ENDIAN (0) //< It's a little-endian target architecture -#endif -#endif - -/* - * Debugging and testing. - */ -#ifndef SZ_DEBUG -#if defined(DEBUG) || defined(_DEBUG) // This means "Not using DEBUG information". -#define SZ_DEBUG (1) -#else -#define SZ_DEBUG (0) -#endif -#endif - -/** - * @brief Threshold for switching to SWAR (8-bytes at a time) backend over serial byte-level for-loops. - * On very short strings, under 16 bytes long, at most a single word will be processed with SWAR. - * Assuming potentially misaligned loads, SWAR makes sense only after ~24 bytes. - */ -#ifndef SZ_SWAR_THRESHOLD -#if SZ_DEBUG -#define SZ_SWAR_THRESHOLD (8u) // 8 bytes in debug builds -#else -#define SZ_SWAR_THRESHOLD (24u) // 24 bytes in release builds -#endif -#endif - -/* Annotation for the public API symbols: + * Convenience functions for character-set matching: * - * - `SZ_PUBLIC` is used for functions that are part of the public API. - * - `SZ_INTERNAL` is used for internal helper functions with unstable APIs. - * - `SZ_DYNAMIC` is used for functions that are part of the public API, but are dispatched at runtime. 
+ * - `sz_find_char_from` + * - `sz_find_char_not_from` + * - `sz_rfind_char_from` + * - `sz_rfind_char_not_from` */ -#ifndef SZ_DYNAMIC -#if SZ_DYNAMIC_DISPATCH -#if defined(_WIN32) || defined(__CYGWIN__) -#define SZ_DYNAMIC __declspec(dllexport) -#define SZ_EXTERNAL __declspec(dllimport) -#define SZ_PUBLIC inline static -#define SZ_INTERNAL inline static -#else -#define SZ_DYNAMIC __attribute__((visibility("default"))) -#define SZ_EXTERNAL extern -#define SZ_PUBLIC __attribute__((unused)) inline static -#define SZ_INTERNAL __attribute__((always_inline)) inline static -#endif // _WIN32 || __CYGWIN__ -#else -#define SZ_DYNAMIC inline static -#define SZ_EXTERNAL extern -#define SZ_PUBLIC inline static -#define SZ_INTERNAL inline static -#endif // SZ_DYNAMIC_DISPATCH -#endif // SZ_DYNAMIC +#ifndef STRINGZILLA_FIND_H_ +#define STRINGZILLA_FIND_H_ -/** - * @brief Alignment macro for 64-byte alignment. - */ -#if defined(_MSC_VER) -#define SZ_ALIGN64 __declspec(align(64)) -#elif defined(__GNUC__) || defined(__clang__) -#define SZ_ALIGN64 __attribute__((aligned(64))) -#else -#define SZ_ALIGN64 -#endif +#include "types.h" #ifdef __cplusplus extern "C" { #endif -/* - * Let's infer the integer types or pull them from LibC, - * if that is allowed by the user. - */ -#if !SZ_AVOID_LIBC -#include // `size_t` -#include // `uint8_t` -typedef int8_t sz_i8_t; // Always 8 bits -typedef uint8_t sz_u8_t; // Always 8 bits -typedef uint16_t sz_u16_t; // Always 16 bits -typedef int32_t sz_i32_t; // Always 32 bits -typedef uint32_t sz_u32_t; // Always 32 bits -typedef uint64_t sz_u64_t; // Always 64 bits -typedef int64_t sz_i64_t; // Always 64 bits -typedef size_t sz_size_t; // Pointer-sized unsigned integer, 32 or 64 bits -typedef ptrdiff_t sz_ssize_t; // Signed version of `sz_size_t`, 32 or 64 bits - -#else // if SZ_AVOID_LIBC: - -// ! The C standard doesn't specify the signedness of char. -// ! On x86 char is signed by default while on Arm it is unsigned by default. -// ! That's why we don't define `sz_char_t` and generally use explicit `sz_i8_t` and `sz_u8_t`. -typedef signed char sz_i8_t; // Always 8 bits -typedef unsigned char sz_u8_t; // Always 8 bits -typedef unsigned short sz_u16_t; // Always 16 bits -typedef int sz_i32_t; // Always 32 bits -typedef unsigned int sz_u32_t; // Always 32 bits -typedef long long sz_i64_t; // Always 64 bits -typedef unsigned long long sz_u64_t; // Always 64 bits - -// Now we need to redefine the `size_t`. -// Microsoft Visual C++ (MSVC) typically follows LLP64 data model on 64-bit platforms, -// where integers, pointers, and long types have different sizes: -// -// > `int` is 32 bits -// > `long` is 32 bits -// > `long long` is 64 bits -// > pointer (thus, `size_t`) is 64 bits -// -// In contrast, GCC and Clang on 64-bit Unix-like systems typically follow the LP64 model, where: -// -// > `int` is 32 bits -// > `long` and pointer (thus, `size_t`) are 64 bits -// > `long long` is also 64 bits -// -// Source: https://learn.microsoft.com/en-us/windows/win32/winprog64/abstract-data-models -#if SZ_DETECT_64_BIT -typedef unsigned long long sz_size_t; // 64-bit. -typedef long long sz_ssize_t; // 64-bit. -#else -typedef unsigned sz_size_t; // 32-bit. -typedef unsigned sz_ssize_t; // 32-bit. -#endif // SZ_DETECT_64_BIT - -#endif // SZ_AVOID_LIBC - -/** - * @brief Compile-time assert macro similar to `static_assert` in C++. - */ -#define sz_static_assert(condition, name) \ - typedef struct { \ - int static_assert_##name : (condition) ? 
1 : -1; \ - } sz_static_assert_##name##_t - -sz_static_assert(sizeof(sz_size_t) == sizeof(void *), sz_size_t_must_be_pointer_size); -sz_static_assert(sizeof(sz_ssize_t) == sizeof(void *), sz_ssize_t_must_be_pointer_size); - -#pragma region Public API - -typedef char *sz_ptr_t; // A type alias for `char *` -typedef char const *sz_cptr_t; // A type alias for `char const *` -typedef sz_i8_t sz_error_cost_t; // Character mismatch cost for fuzzy matching functions - -typedef sz_u64_t sz_sorted_idx_t; // Index of a sorted string in a list of strings - -typedef enum { sz_false_k = 0, sz_true_k = 1 } sz_bool_t; // Only one relevant bit -typedef enum { sz_less_k = -1, sz_equal_k = 0, sz_greater_k = 1 } sz_ordering_t; // Only three possible states: <=> - -/** - * @brief Tiny string-view structure. It's POD type, unlike the `std::string_view`. - */ -typedef struct sz_string_view_t { - sz_cptr_t start; - sz_size_t length; -} sz_string_view_t; - -/** - * @brief Enumeration of SIMD capabilities of the target architecture. - * Used to introspect the supported functionality of the dynamic library. - */ -typedef enum sz_capability_t { - sz_cap_serial_k = 1, /// Serial (non-SIMD) capability - sz_cap_any_k = 0x7FFFFFFF, /// Mask representing any capability - - sz_cap_arm_neon_k = 1 << 10, /// ARM NEON capability - sz_cap_arm_sve_k = 1 << 11, /// ARM SVE capability TODO: Not yet supported or used - sz_cap_arm_sve2_k = 1 << 12, - sz_cap_arm_sve2p1_k = 1 << 13, - sz_cap_x86_avx2_k = 1 << 20, /// x86 AVX2 capability - sz_cap_x86_avx512f_k = 1 << 21, /// x86 AVX512 F capability - sz_cap_x86_avx512bw_k = 1 << 22, /// x86 AVX512 BW instruction capability - sz_cap_x86_avx512vl_k = 1 << 23, /// x86 AVX512 VL instruction capability - sz_cap_x86_avx512vbmi_k = 1 << 24, /// x86 AVX512 VBMI instruction capability - sz_cap_x86_gfni_k = 1 << 25, /// x86 AVX512 GFNI instruction capability - -} sz_capability_t; - -/** - * @brief Function to determine the SIMD capabilities of the current machine @b only at @b runtime. - * @return A bitmask of the SIMD capabilities represented as a `sz_capability_t` enum value. - */ -SZ_DYNAMIC sz_capability_t sz_capabilities(void); - -/** - * @brief Bit-set structure for 256 possible byte values. Useful for filtering and search. - * @see sz_charset_init, sz_charset_add, sz_charset_contains, sz_charset_invert - */ -typedef union sz_charset_t { - sz_u64_t _u64s[4]; - sz_u32_t _u32s[8]; - sz_u16_t _u16s[16]; - sz_u8_t _u8s[32]; -} sz_charset_t; - -/** @brief Initializes a bit-set to an empty collection, meaning - all characters are banned. */ -SZ_PUBLIC void sz_charset_init(sz_charset_t *s) { s->_u64s[0] = s->_u64s[1] = s->_u64s[2] = s->_u64s[3] = 0; } - -/** @brief Adds a character to the set and accepts @b unsigned integers. */ -SZ_PUBLIC void sz_charset_add_u8(sz_charset_t *s, sz_u8_t c) { s->_u64s[c >> 6] |= (1ull << (c & 63u)); } - -/** @brief Adds a character to the set. Consider @b sz_charset_add_u8. */ -SZ_PUBLIC void sz_charset_add(sz_charset_t *s, char c) { sz_charset_add_u8(s, *(sz_u8_t *)(&c)); } // bitcast - -/** @brief Checks if the set contains a given character and accepts @b unsigned integers. 
*/ -SZ_PUBLIC sz_bool_t sz_charset_contains_u8(sz_charset_t const *s, sz_u8_t c) { - // Checking the bit can be done in different ways: - // - (s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0 - // - (s->_u32s[c >> 5] & (1u << (c & 31u))) != 0 - // - (s->_u16s[c >> 4] & (1u << (c & 15u))) != 0 - // - (s->_u8s[c >> 3] & (1u << (c & 7u))) != 0 - return (sz_bool_t)((s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0); -} - -/** @brief Checks if the set contains a given character. Consider @b sz_charset_contains_u8. */ -SZ_PUBLIC sz_bool_t sz_charset_contains(sz_charset_t const *s, char c) { - return sz_charset_contains_u8(s, *(sz_u8_t *)(&c)); // bitcast -} - -/** @brief Inverts the contents of the set, so allowed character get disallowed, and vice versa. */ -SZ_PUBLIC void sz_charset_invert(sz_charset_t *s) { - s->_u64s[0] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[1] ^= 0xFFFFFFFFFFFFFFFFull, // - s->_u64s[2] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[3] ^= 0xFFFFFFFFFFFFFFFFull; -} - -typedef void *(*sz_memory_allocate_t)(sz_size_t, void *); -typedef void (*sz_memory_free_t)(void *, sz_size_t, void *); -typedef sz_u64_t (*sz_random_generator_t)(void *); - -/** - * @brief Some complex pattern matching algorithms may require memory allocations. - * This structure is used to pass the memory allocator to those functions. - * @see sz_memory_allocator_init_fixed - */ -typedef struct sz_memory_allocator_t { - sz_memory_allocate_t allocate; - sz_memory_free_t free; - void *handle; -} sz_memory_allocator_t; - -/** - * @brief Initializes a memory allocator to use the system default `malloc` and `free`. - * ! The function is not available if the library was compiled with `SZ_AVOID_LIBC`. - * - * @param alloc Memory allocator to initialize. - */ -SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc); - -/** - * @brief Initializes a memory allocator to use a static-capacity buffer. - * No dynamic allocations will be performed. - * - * @param alloc Memory allocator to initialize. - * @param buffer Buffer to use for allocations. - * @param length Length of the buffer. @b Must be greater than 8 bytes. Different values would be optimal for - * different algorithms and input lengths, but 4096 bytes (one RAM page) is a good default. - */ -SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length); - -/** - * @brief The number of bytes a stack-allocated string can hold, including the SZ_NULL termination character. - * ! This can't be changed from outside. Don't use the `#error` as it may already be included and set. - */ -#ifdef SZ_STRING_INTERNAL_SPACE -#undef SZ_STRING_INTERNAL_SPACE -#endif -#define SZ_STRING_INTERNAL_SPACE (sizeof(sz_size_t) * 3 - 1) // 3 pointers minus one byte for an 8-bit length - -/** - * @brief Tiny memory-owning string structure with a Small String Optimization (SSO). - * Differs in layout from Folly, Clang, GCC, and probably most other implementations. - * It's designed to avoid any branches on read-only operations, and can store up - * to 22 characters on stack on 64-bit machines, followed by the SZ_NULL-termination character. - * - * @section Changing Length - * - * One nice thing about this design, is that you can, in many cases, change the length of the string - * without any branches, invoking a `+=` or `-=` on the 64-bit `length` field. If the string is on heap, - * the solution is obvious. If it's on stack, inplace decrement wouldn't affect the top bytes of the string, - * only changing the last byte containing the length. 
- */ -typedef union sz_string_t { - -#if !SZ_DETECT_BIG_ENDIAN - - struct external { - sz_ptr_t start; - sz_size_t length; - sz_size_t space; - sz_size_t padding; - } external; - - struct internal { - sz_ptr_t start; - sz_u8_t length; - char chars[SZ_STRING_INTERNAL_SPACE]; - } internal; - -#else - - struct external { - sz_ptr_t start; - sz_size_t space; - sz_size_t padding; - sz_size_t length; - } external; - - struct internal { - sz_ptr_t start; - char chars[SZ_STRING_INTERNAL_SPACE]; - sz_u8_t length; - } internal; - -#endif - - sz_size_t words[4]; - -} sz_string_t; - -typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t); -typedef sz_u64_t (*sz_checksum_t)(sz_cptr_t, sz_size_t); -typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t); -typedef sz_ordering_t (*sz_order_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -typedef void (*sz_to_converter_t)(sz_cptr_t, sz_size_t, sz_ptr_t); - -/** - * @brief Computes the 64-bit check-sum of bytes in a string. - * Similar to `std::ranges::accumulate`. - * - * @param text String to aggregate. - * @param length Number of bytes in the text. - * @return 64-bit unsigned value. - */ -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length); - -/** @copydoc sz_checksum */ -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length); - -/** - * @brief Computes the 64-bit unsigned hash of a string. Fairly fast for short strings, - * simple implementation, and supports rolling computation, reused in other APIs. - * Similar to `std::hash` in C++. - * - * @param text String to hash. - * @param length Number of bytes in the text. - * @return 64-bit hash value. - * - * @see sz_hashes, sz_hashes_fingerprint, sz_hashes_intersection - */ -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length); - -/** @copydoc sz_hash */ -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length); - -/** - * @brief Checks if two string are equal. - * Similar to `memcmp(a, b, length) == 0` in LibC and `a == b` in STL. - * - * The implementation of this function is very similar to `sz_order`, but the usage patterns are different. - * This function is more often used in parsing, while `sz_order` is often used in sorting. - * It works best on platforms with cheap - * - * @param a First string to compare. - * @param b Second string to compare. - * @param length Number of bytes in both strings. - * @return 1 if strings match, 0 otherwise. - */ -SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length); - -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length); - -/** - * @brief Estimates the relative order of two strings. Equivalent to `memcmp(a, b, length)` in LibC. - * Can be used on different length strings. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * @return Negative if (a < b), positive if (a > b), zero if they are equal. - */ -SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); - -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); - -/** - * @brief Look Up Table @b (LUT) transformation of a string. Equivalent to `for (char & c : text) c = lut[c]`. 
- * - * Can be used to implement some form of string normalization, partially masking punctuation marks, - * or converting between different character sets, like uppercase or lowercase. Surprisingly, also has - * broad implications in image processing, where image channel transformations are often done using LUTs. - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param lut Look Up Table to apply. Must be exactly @b 256 bytes long. - * @param result Output string, can point to the same address as ::text. - */ -SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); - -typedef void (*sz_look_up_transform_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t); - -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = tolower(c)`. - * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_tolower(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = toupper(c)`. - * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_toupper(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = toascii(c)`. - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Checks if all characters in the range are valid ASCII characters. - * - * @param text String to be analyzed. - * @param length Number of bytes in the string. - * @return Whether all characters are valid ASCII characters. - */ -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t text, sz_size_t length); - -/** - * @brief Generates a random string for a given alphabet, avoiding integer division and modulo operations. - * Similar to `text[i] = alphabet[rand() % cardinality]`. - * - * The modulo operation is expensive, and should be avoided in performance-critical code. - * We avoid it using small lookup tables and replacing it with a multiplication and shifts, similar to `libdivide`. 
- * Alternative algorithms would include: - * - Montgomery form: https://en.algorithmica.org/hpc/number-theory/montgomery/ - * - Barret reduction: https://www.nayuki.io/page/barrett-reduction-algorithm - * - Lemire's trick: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ - * - * @param alphabet Set of characters to sample from. - * @param cardinality Number of characters to sample from. - * @param text Output string, can point to the same address as ::text. - * @param generate Callback producing random numbers given the generator state. - * @param generator Generator state, can be a pointer to a seed, or a pointer to a random number generator. - */ -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); - -/** @copydoc sz_generate */ -SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); - -/** - * @brief Similar to `memcpy`, copies contents of one string into another. - * The behavior is undefined if the strings overlap. - * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. - */ -SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** - * @brief Similar to `memmove`, copies (moves) contents of one string into another. - * Unlike `sz_copy`, allows overlapping strings as arguments. - * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. - */ -SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -typedef void (*sz_move_t)(sz_ptr_t, sz_cptr_t, sz_size_t); - -/** - * @brief Similar to `memset`, fills a string with a given value. - * - * @param target String to fill. - * @param length Number of bytes to fill. - * @param value Value to fill with. - */ -SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value); - -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value); - -typedef void (*sz_fill_t)(sz_ptr_t, sz_size_t, sz_u8_t); - -/** - * @brief Initializes a string class instance to an empty value. - */ -SZ_PUBLIC void sz_string_init(sz_string_t *string); - -/** - * @brief Convenience function checking if the provided string is stored inside of the ::string instance itself, - * alternative being - allocated in a remote region of the heap. - */ -SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string); - -/** - * @brief Unpacks the opaque instance of a string class into its components. - * Recommended to use only in read-only operations. - * - * @param string String to unpack. - * @param start Pointer to the start of the string. - * @param length Number of bytes in the string, before the SZ_NULL character. - * @param space Number of bytes allocated for the string (heap or stack), including the SZ_NULL character. - * @param is_external Whether the string is allocated on the heap externally, or fits withing ::string instance. 
- */ -SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, - sz_bool_t *is_external); - -/** - * @brief Unpacks only the start and length of the string. - * Recommended to use only in read-only operations. - * - * @param string String to unpack. - * @param start Pointer to the start of the string. - * @param length Number of bytes in the string, before the SZ_NULL character. - */ -SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length); - -/** - * @brief Constructs a string of a given ::length with noisy contents. - * Use the returned character pointer to populate the string. - * - * @param string String to initialize. - * @param length Number of bytes in the string, before the SZ_NULL character. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator); - -/** - * @brief Doesn't change the contents or the length of the string, but grows the available memory capacity. - * This is beneficial, if several insertions are expected, and we want to minimize allocations. - * - * @param string String to grow. - * @param new_capacity The number of characters to reserve space for, including existing ones. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the new start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator); - -/** - * @brief Grows the string by adding an uninitialized region of ::added_length at the given ::offset. - * Would often be used in conjunction with one or more `sz_copy` calls to populate the allocated region. - * Similar to `sz_string_reserve`, but changes the length of the ::string. - * - * @param string String to grow. - * @param offset Offset of the first byte to reserve space for. - * If provided offset is larger than the length, it will be capped. - * @param added_length The number of new characters to reserve space for. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the new start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length, - sz_memory_allocator_t *allocator); - -/** - * @brief Removes a range from a string. Changes the length, but not the capacity. - * Performs no allocations or deallocations and can't fail. - * - * @param string String to clean. - * @param offset Offset of the first byte to remove. - * @param length Number of bytes to remove. Out-of-bound ranges will be capped. - * @return Number of bytes removed. - */ -SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length); - -/** - * @brief Shrinks the string to fit the current length, if it's allocated on the heap. - * It's the reverse operation of ::sz_string_reserve. - * - * @param string String to shrink. - * @param allocator Memory allocator to use for the allocation. - * @return Whether the operation was successful. The only failures can come from the allocator. - * On failure, the string will remain unchanged. 
- */ -SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator); - -/** - * @brief Frees the string, if it's allocated on the heap. - * If the string is on the stack, the function clears/resets the state. - */ -SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator); - -#pragma endregion - -#pragma region Fast Substring Search API - -typedef sz_cptr_t (*sz_find_byte_t)(sz_cptr_t, sz_size_t, sz_cptr_t); -typedef sz_cptr_t (*sz_find_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_charset_t const *); +#pragma region Core API /** * @brief Locates first matching byte in a string. Equivalent to `memchr(haystack, *needle, h_length)` in LibC. @@ -733,9 +41,6 @@ typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_charset_t const *); */ SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - /** * @brief Locates last matching byte in a string. Equivalent to `memrchr(haystack, *needle, h_length)` in LibC. * @@ -749,9 +54,32 @@ SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t haystack, sz_size_t h_length, */ SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +/** @copydoc sz_find_byte */ +SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); /** @copydoc sz_rfind_byte */ SZ_PUBLIC sz_cptr_t sz_rfind_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +#if SZ_USE_HASWELL +/** @copydoc sz_find_byte */ +SZ_PUBLIC sz_cptr_t sz_find_byte_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +/** @copydoc sz_rfind_byte */ +SZ_PUBLIC sz_cptr_t sz_rfind_byte_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +#endif + +#if SZ_USE_SKYLAKE +/** @copydoc sz_find_byte */ +SZ_PUBLIC sz_cptr_t sz_find_byte_skylake(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +/** @copydoc sz_rfind_byte */ +SZ_PUBLIC sz_cptr_t sz_rfind_byte_skylake(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +#endif + +#if SZ_USE_NEON +/** @copydoc sz_find_byte */ +SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +/** @copydoc sz_rfind_byte */ +SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +#endif + /** * @brief Locates first matching substring. * Equivalent to `memmem(haystack, h_length, needle, n_length)` in LibC. @@ -765,9 +93,6 @@ SZ_PUBLIC sz_cptr_t sz_rfind_byte_serial(sz_cptr_t haystack, sz_size_t h_length, */ SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - /** * @brief Locates the last matching substring. 
* @@ -779,29 +104,49 @@ SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cp */ SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); +/** @copydoc sz_find */ +SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); /** @copydoc sz_rfind */ SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** - * @brief Finds the first character present from the ::set, present in ::text. - * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_rfind_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". - * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * +#if SZ_USE_HASWELL +/** @copydoc sz_find */ +SZ_PUBLIC sz_cptr_t sz_find_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +/** @copydoc sz_rfind */ +SZ_PUBLIC sz_cptr_t sz_rfind_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +#endif + +#if SZ_USE_SKYLAKE +/** @copydoc sz_find */ +SZ_PUBLIC sz_cptr_t sz_find_skylake(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +/** @copydoc sz_rfind */ +SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +#endif + +#if SZ_USE_NEON +/** @copydoc sz_find */ +SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +/** @copydoc sz_rfind */ +SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +#endif + +/** + * @brief Finds the first character present from the ::set, present in ::text. + * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. + * May have identical implementation and performance to ::sz_rfind_charset. + * + * Useful for parsing, when we want to skip a set of characters. Examples: + * * 6 whitespaces: " \t\n\r\v\f". + * * 16 digits forming a float number: "0123456789,.eE+-". + * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. + * * 2 JSON string special characters useful to locate the end of the string: "\"\\". + * * @param text String to be scanned. * @param set Set of relevant characters. * @return Pointer to the first matching character from ::set. */ SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - /** * @brief Finds the last character present from the ::set, present in ::text. * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. 
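[Editor's note, not part of the patch: a minimal usage sketch for the character-set search documented in this hunk. It assumes only the declarations shown above (`sz_charset_init`, `sz_charset_add`, `sz_find_charset`) and the umbrella `stringzilla.h` header; the "not found" return convention is an assumption and should be checked against the library's actual behavior.]

    #include <stringzilla/stringzilla.h>

    /* Locate the first ASCII whitespace byte in `text`, using the 256-bit `sz_charset_t` filter. */
    static sz_cptr_t find_first_whitespace(sz_cptr_t text, sz_size_t length) {
        char const *whitespaces = " \t\n\r\v\f"; /* the six whitespaces mentioned in the docs above */
        sz_charset_t set;
        sz_charset_init(&set); /* start from an empty set (every byte banned) */
        for (char const *c = whitespaces; *c; ++c) sz_charset_add(&set, *c);
        return sz_find_charset(text, length, &set); /* assumed to return a null pointer when nothing matches */
    }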
@@ -819,3406 +164,680 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_ */ SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); +/** @copydoc sz_find_charset */ +SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); /** @copydoc sz_rfind_charset */ SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -#pragma endregion - -#pragma region String Similarity Measures API - -/** - * @brief Computes the Hamming distance between two strings - number of not matching characters. - * Difference in length is is counted as a mismatch. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for the distance, the `bound` if was exceeded. - * - * @see sz_hamming_distance_utf8 - * @see https://en.wikipedia.org/wiki/Hamming_distance - */ -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); +#if SZ_USE_HASWELL +/** @copydoc sz_find_charset */ +SZ_PUBLIC sz_cptr_t sz_find_charset_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +/** @copydoc sz_rfind_charset */ +SZ_PUBLIC sz_cptr_t sz_rfind_charset_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +#endif -/** @copydoc sz_hamming_distance */ -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); +#if SZ_USE_ICE +/** @copydoc sz_find_charset */ +SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +/** @copydoc sz_rfind_charset */ +SZ_PUBLIC sz_cptr_t sz_rfind_charset_ice(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +#endif -/** - * @brief Computes the Hamming distance between two @b UTF8 strings - number of not matching characters. - * Difference in length is is counted as a mismatch. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for the distance, the `bound` if was exceeded. 
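[Editor's note, not part of the patch: a tiny sketch of the byte-level `sz_hamming_distance` call documented above. The expected value follows directly from the definition; passing `0` as `bound` relies on the documented convention that zero disables the early exit.]

    #include <assert.h>
    #include <stringzilla/stringzilla.h>

    int main(void) {
        /* "kitten" and "sitten" are equal-length and differ in exactly one byte. */
        sz_size_t distance = sz_hamming_distance("kitten", 6, "sitten", 6, 0);
        assert(distance == 1);
        return 0;
    }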
- * - * @see sz_hamming_distance - * @see https://en.wikipedia.org/wiki/Hamming_distance - */ -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); +#if SZ_USE_NEON +/** @copydoc sz_find_charset */ +SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +/** @copydoc sz_rfind_charset */ +SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +#endif -/** @copydoc sz_hamming_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); +#pragma endregion // Core API -typedef sz_size_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t); +#pragma region Serial Implementation /** - * @brief Computes the Levenshtein edit-distance between two strings using the Wagner-Fisher algorithm. - * Similar to the Needleman-Wunsch alignment algorithm. Often used in fuzzy string matching. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Exclusive upper bound on the distance, that allows us to exit early. - * Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore. - * Pass zero to check if the strings are equal. - * @return Unsigned integer for the edit distance. Zero means the strings are equal. - * Returns the `bound` if it was exceeded or `SZ_SIZE_MAX` if the memory allocation failed. - * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default - * @see https://en.wikipedia.org/wiki/Levenshtein_distance + * @brief Byte-level equality comparison between two strings. + * If unaligned loads are allowed, uses a switch-table to avoid loops on short strings. */ -SZ_DYNAMIC sz_size_t sz_edit_distance(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** @copydoc sz_edit_distance */ -SZ_PUBLIC sz_size_t sz_edit_distance_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); +SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { + sz_cptr_t const a_end = a + length; +#if SZ_USE_MISALIGNED_LOADS + if (length >= SZ_SWAR_THRESHOLD) { + sz_u64_vec_t a_vec, b_vec; + for (; a + 8 <= a_end; a += 8, b += 8) { + a_vec = sz_u64_load(a); + b_vec = sz_u64_load(b); + if (a_vec.u64 != b_vec.u64) return sz_false_k; + } + } +#endif + while (a != a_end && *a == *b) a++, b++; + return (sz_bool_t)(a_end == a); +} /** - * @brief Computes the Levenshtein edit-distance between two @b UTF8 strings. - * Unlike `sz_edit_distance`, reports the distance in Unicode codepoints, and not in bytes. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. + * @brief Chooses the offsets of the most interesting characters in a search needle. * - * @param alloc Temporary memory allocator. 
Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for edit distance, the `bound` if was exceeded or `SZ_SIZE_MAX` - * if the memory allocation failed. + * Search throughput can significantly deteriorate if we are matching the wrong characters. + * Say the needle is "aXaYa", and we are comparing the first, second, and last character. + * If we use SIMD and compare many offsets at a time, comparing against "a" in every register is a waste. * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default, sz_edit_distance - * @see https://en.wikipedia.org/wiki/Levenshtein_distance + * Similarly, dealing with UTF8 inputs, we know that the lower bits of each character code carry more information. + * Cyrillic alphabet, for example, falls into [0x0410, 0x042F] code range for uppercase [А, Я], and + * into [0x0430, 0x044F] for lowercase [а, я]. Scanning through a text written in Russian, half of the + * bytes will carry absolutely no value and will be equal to 0x04. */ -SZ_DYNAMIC sz_size_t sz_edit_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -typedef sz_size_t (*sz_edit_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_memory_allocator_t *); +SZ_INTERNAL void _sz_locate_needle_anomalies( // + sz_cptr_t start, sz_size_t length, // + sz_size_t *first, sz_size_t *second, sz_size_t *third) { -/** @copydoc sz_edit_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_edit_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); + *first = 0; + *second = length / 2; + *third = length - 1; -/** - * @brief Computes Needleman–Wunsch alignment score for two string. Often used in bioinformatics and cheminformatics. - * Similar to the Levenshtein edit-distance, parameterized for gap and substitution penalties. - * - * Not commutative in the general case, as the order of the strings matters, as `sz_alignment_score(a, b)` may - * not be equal to `sz_alignment_score(b, a)`. Becomes @b commutative, if the substitution costs are symmetric. - * Equivalent to the negative Levenshtein distance, if: `gap == -1` and `subs[i][j] == (i == j ? 0: -1)`. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * @param gap Penalty cost for gaps - insertions and removals. - * @param subs Substitution costs matrix with 256 x 256 values for all pairs of characters. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @return Signed similarity score. Can be negative, depending on the substitution costs. - * If the memory allocation fails, the function returns `SZ_SSIZE_MAX`. 
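[Editor's note, not part of the patch: a hedged sketch of the `sz_alignment_score` call documented above, using the unit-cost setup under which the docs equate the score with the negative Levenshtein distance. The 4096-byte scratch buffer follows the suggested default from the `sz_memory_allocator_init_fixed` documentation and is only adequate for short inputs; treat this as an illustration, not the library's reference usage.]

    #include <stringzilla/stringzilla.h>

    /* Score two strings with gap == -1 and subs[i][j] == (i == j ? 0 : -1). */
    static sz_ssize_t negative_levenshtein(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) {
        static sz_error_cost_t subs[256][256];
        for (int i = 0; i != 256; ++i)
            for (int j = 0; j != 256; ++j) subs[i][j] = (sz_error_cost_t)(i == j ? 0 : -1);

        char buffer[4096]; /* one RAM page of scratch space, enough for short inputs */
        sz_memory_allocator_t alloc;
        sz_memory_allocator_init_fixed(&alloc, buffer, sizeof(buffer));

        /* Per the docs above, returns `SZ_SSIZE_MAX` if the allocation (here, the fixed buffer) fails. */
        return sz_alignment_score(a, a_length, b, b_length, &subs[0][0], -1, &alloc);
    }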
- * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default - * @see https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm - */ -SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); + // + int has_duplicates = // + start[*first] == start[*second] || // + start[*first] == start[*third] || // + start[*second] == start[*third]; -/** @copydoc sz_alignment_score */ -SZ_PUBLIC sz_ssize_t sz_alignment_score_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); + // Loop through letters to find non-colliding variants. + if (length > 3 && has_duplicates) { + // Pivot the middle point right, until we find a character different from the first one. + while (start[*second] == start[*first] && *second + 1 < *third) ++(*second); + // Pivot the third (last) point left, until we find a different character. + while ((start[*third] == start[*second] || start[*third] == start[*first]) && *third > (*second + 1)) + --(*third); + } -typedef sz_ssize_t (*sz_alignment_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *, - sz_error_cost_t, sz_memory_allocator_t *); + // TODO: Investigate alternative strategies for long needles. + // On very long needles we have the luxury to choose! + // Often dealing with UTF8, we will likely benefit from shifting the first and second characters + // further to the right, to achieve not only uniqueness within the needle, but also avoid common + // rune prefixes of 2-, 3-, and 4-byte codes. + if (length > 8) { + // Pivot the first and second points right, until we find a character, that: + // > is different from others. + // > doesn't start with 0b'110x'xxxx - only 5 bits of relevant info. + // > doesn't start with 0b'1110'xxxx - only 4 bits of relevant info. + // > doesn't start with 0b'1111'0xxx - only 3 bits of relevant info. + // + // So we are practically searching for byte values that start with 0b0xxx'xxxx or 0b'10xx'xxxx. + // Meaning they fall in the range [0, 127] and [128, 191], in other words any unsigned int up to 191. + sz_u8_t const *start_u8 = (sz_u8_t const *)start; + sz_size_t vibrant_first = *first, vibrant_second = *second, vibrant_third = *third; -typedef void (*sz_hash_callback_t)(sz_cptr_t, sz_size_t, sz_u64_t, void *user); + // Let's begin with the seccond character, as the termination criteria there is more obvious + // and we may end up with more variants to check for the first candidate. + while ((start_u8[vibrant_second] > 191 || start_u8[vibrant_second] == start_u8[vibrant_third]) && + (vibrant_second + 1 < vibrant_third)) + ++vibrant_second; -/** - * @brief Computes the Karp-Rabin rolling hashes of a string supplying them to the provided `callback`. - * Can be used for similarity scores, search, ranking, etc. - * - * Rabin-Karp-like rolling hashes can have very high-level of collisions and depend - * on the choice of bases and the prime number. That's why, often two hashes from the same - * family are used with different bases. - * - * 1. Kernighan and Ritchie's function uses 31, a prime close to the size of English alphabet. - * 2. To be friendlier to byte-arrays and UTF8, we use 257 for the second function. - * - * Choosing the right ::window_length is task- and domain-dependant. 
For example, most English words are - * between 3 and 7 characters long, so a window of 4 bytes would be a good choice. For DNA sequences, - * the ::window_length might be a multiple of 3, as the codons are 3 (nucleotides) bytes long. - * With such minimalistic alphabets of just four characters (AGCT) longer windows might be needed. - * For protein sequences the alphabet is 20 characters long, so the window can be shorter, than for DNAs. - * - * @param text String to hash. - * @param length Number of bytes in the string. - * @param window_length Length of the rolling window in bytes. - * @param window_step Step of reported hashes. @b Must be power of two. Should be smaller than `window_length`. - * @param callback Function receiving the start & length of a substring, the hash, and the `callback_handle`. - * @param callback_handle Optional user-provided pointer to be passed to the `callback`. - * @see sz_hashes_fingerprint, sz_hashes_intersection - */ -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); + // Now check if we've indeed found a good candidate or should revert the `vibrant_second` to `second`. + if (start_u8[vibrant_second] < 191) { *second = vibrant_second; } + else { vibrant_second = *second; } -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); + // Now check the first character. + while ((start_u8[vibrant_first] > 191 || start_u8[vibrant_first] == start_u8[vibrant_second] || + start_u8[vibrant_first] == start_u8[vibrant_third]) && + (vibrant_first + 1 < vibrant_second)) + ++vibrant_first; -typedef void (*sz_hashes_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_size_t, sz_hash_callback_t, void *); + // Now check if we've indeed found a good candidate or should revert the `vibrant_first` to `first`. + // We don't need to shift the third one when dealing with texts as the last byte of the text is + // also the last byte of a rune and contains the most information. + if (start_u8[vibrant_first] < 191) { *first = vibrant_first; } + } +} -/** - * @brief Computes the Karp-Rabin rolling hashes of a string outputting a binary fingerprint. - * Such fingerprints can be compared with Hamming or Jaccard (Tanimoto) distance for similarity. - * - * The algorithm doesn't clear the fingerprint buffer on start, so it can be invoked multiple times - * to produce a fingerprint of a longer string, by passing the previous fingerprint as the ::fingerprint. - * It can also be reused to produce multi-resolution fingerprints by changing the ::window_length - * and calling the same function multiple times for the same input ::text. - * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text String to hash. - * @param length Number of bytes in the string. - * @param fingerprint Output fingerprint buffer. - * @param fingerprint_bytes Number of bytes in the fingerprint buffer. - * @param window_length Length of the rolling window in bytes. 
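To make the offset-picking heuristic above concrete, here is a stripped-down re-implementation of the duplicate-avoidance step only (the UTF-8 refinement for long needles is omitted); `pick_offsets` is an illustrative name, not a library symbol:

#include <stddef.h>
#include <stdio.h>

// Pick the first, middle, and last byte of the needle, then nudge the middle and
// last offsets until the three sampled bytes differ, where possible.
static void pick_offsets(char const *needle, size_t length, size_t *first, size_t *second, size_t *third) {
    *first = 0, *second = length / 2, *third = length - 1;
    int has_duplicates = needle[*first] == needle[*second] || needle[*first] == needle[*third] ||
                         needle[*second] == needle[*third];
    if (length > 3 && has_duplicates) {
        while (needle[*second] == needle[*first] && *second + 1 < *third) ++(*second);
        while ((needle[*third] == needle[*second] || needle[*third] == needle[*first]) && *third > *second + 1)
            --(*third);
    }
}

int main(void) {
    size_t first, second, third;
    pick_offsets("aXaYa", 5, &first, &second, &third);
    printf("%zu %zu %zu\n", first, second, third); // prints 0 3 4: the middle offset steps off the repeated 'a'
    return 0;
}

With offsets 0, 3, and 4 the SIMD comparisons involve two distinct byte values instead of three copies of 'a'.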
- * @see sz_hashes, sz_hashes_intersection - */ -SZ_PUBLIC void sz_hashes_fingerprint( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_ptr_t fingerprint, sz_size_t fingerprint_bytes); +SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { + for (sz_cptr_t const end = text + length; text != end; ++text) + if (sz_charset_contains(set, *text)) return text; + return SZ_NULL_CHAR; +} -typedef void (*sz_hashes_fingerprint_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_ptr_t, sz_size_t); +SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" + sz_cptr_t const end = text; + for (text += length; text != end;) + if (sz_charset_contains(set, *(text -= 1))) return text; + return SZ_NULL_CHAR; +#pragma GCC diagnostic pop +} /** - * @brief Given a hash-fingerprint of a textual document, computes the number of intersecting hashes - * of the incoming document. Can be used for document scoring and search. - * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text Input document. - * @param length Number of bytes in the input document. - * @param fingerprint Reference document fingerprint. - * @param fingerprint_bytes Number of bytes in the reference documents fingerprint. - * @param window_length Length of the rolling window in bytes. - * @see sz_hashes, sz_hashes_fingerprint + * @brief Byte-level equality comparison between two 64-bit integers. + * @return 64-bit integer, where every top bit in each byte signifies a match. */ -SZ_PUBLIC sz_size_t sz_hashes_intersection( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_cptr_t fingerprint, sz_size_t fingerprint_bytes); - -typedef sz_size_t (*sz_hashes_intersection_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_cptr_t, sz_size_t); +SZ_INTERNAL sz_u64_vec_t _sz_u64_each_byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { + sz_u64_vec_t vec; + vec.u64 = ~(a.u64 ^ b.u64); + // The match is valid, if every bit within each byte is set. + // For that take the bottom 7 bits of each byte, add one to them, + // and if this sets the top bit to one, then all the 7 bits are ones as well. + vec.u64 = ((vec.u64 & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) & ((vec.u64 & 0x8080808080808080ull)); + return vec; +} -#pragma endregion +/* Find the first occurrence of a @b single-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. + * Identical to `memchr(haystack, needle[0], haystack_length)`. + */ +SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { -#pragma region Convenience API + if (!h_length) return SZ_NULL_CHAR; + sz_cptr_t const h_end = h + h_length; -/** - * @brief Finds the first character in the haystack, that is present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); +#if !_SZ_IS_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevity. +#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. 
+ for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) + if (*h == *n) return h; +#endif -/** - * @brief Finds the first character in the haystack, that is @b not present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); + // Broadcast the n into every byte of a 64-bit integer to use SWAR + // techniques and process eight characters at a time. + sz_u64_vec_t h_vec, n_vec, match_vec; + match_vec.u64 = 0; + n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; + for (; h + 8 <= h_end; h += 8) { + h_vec.u64 = *(sz_u64_t const *)h; + match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); + if (match_vec.u64) return h + sz_u64_ctz(match_vec.u64) / 8; + } +#endif -/** - * @brief Finds the last character in the haystack, that is present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); + // Handle the misaligned tail. + for (; h < h_end; ++h) + if (*h == *n) return h; + return SZ_NULL_CHAR; +} -/** - * @brief Finds the last character in the haystack, that is @b not present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset +/* Find the last occurrence of a @b single-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. + * Identical to `memrchr(haystack, needle[0], haystack_length)`. */ -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -#pragma endregion +sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { -#pragma region String Sequences API + if (!h_length) return SZ_NULL_CHAR; + sz_cptr_t const h_start = h; -struct sz_sequence_t; + // Reposition the `h` pointer to the end, as we will be walking backwards. + h = h + h_length - 1; -typedef sz_cptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t); -typedef sz_bool_t (*sz_string_is_less_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); +#if !_SZ_IS_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevity. +#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. + for (; ((sz_size_t)(h + 1) & 7ull) && h >= h_start; --h) + if (*h == *n) return h; +#endif -typedef struct sz_sequence_t { - sz_sorted_idx_t *order; - sz_size_t count; - sz_sequence_member_start_t get_start; - sz_sequence_member_length_t get_length; - void const *handle; -} sz_sequence_t; + // Broadcast the n into every byte of a 64-bit integer to use SWAR + // techniques and process eight characters at a time. 
+ sz_u64_vec_t h_vec, n_vec, match_vec; + n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; + for (; h >= h_start + 7; h -= 8) { + h_vec.u64 = *(sz_u64_t const *)(h - 7); + match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); + if (match_vec.u64) return h - sz_u64_clz(match_vec.u64) / 8; + } +#endif -/** - * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. - * Expects ::offsets to contains `count + 1` entries, the last pointing at the end - * of the last string, indicating the total length of the ::tape. - */ -SZ_PUBLIC void sz_sequence_from_u32tape(sz_cptr_t *start, sz_u32_t const *offsets, sz_size_t count, - sz_sequence_t *sequence); + for (; h >= h_start; --h) + if (*h == *n) return h; + return SZ_NULL_CHAR; +} /** - * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. - * Expects ::offsets to contains `count + 1` entries, the last pointing at the end - * of the last string, indicating the total length of the ::tape. + * @brief 2Byte-level equality comparison between two 64-bit integers. + * @return 64-bit integer, where every top bit in each 2byte signifies a match. */ -SZ_PUBLIC void sz_sequence_from_u64tape(sz_cptr_t *start, sz_u64_t const *offsets, sz_size_t count, - sz_sequence_t *sequence); +SZ_INTERNAL sz_u64_vec_t _sz_u64_each_2byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { + sz_u64_vec_t vec; + vec.u64 = ~(a.u64 ^ b.u64); + // The match is valid, if every bit within each 2byte is set. + // For that take the bottom 15 bits of each 2byte, add one to them, + // and if this sets the top bit to one, then all the 15 bits are ones as well. + vec.u64 = ((vec.u64 & 0x7FFF7FFF7FFF7FFFull) + 0x0001000100010001ull) & ((vec.u64 & 0x8000800080008000ull)); + return vec; +} /** - * @brief Similar to `std::partition`, given a predicate splits the sequence into two parts. - * The algorithm is unstable, meaning that elements may change relative order, as long - * as they are in the right partition. This is the simpler algorithm for partitioning. + * @brief Find the first occurrence of a @b two-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. */ -SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate); +SZ_INTERNAL sz_cptr_t _sz_find_2byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { -/** - * @brief Inplace `std::set_union` for two consecutive chunks forming the same continuous `sequence`. - * - * @param partition The number of elements in the first sub-sequence in `sequence`. - * @param less Comparison function, to determine the lexicographic ordering. - */ -SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less); + // This is an internal method, and the haystack is guaranteed to be at least 2 bytes long. + sz_assert(h_length >= 2 && "The haystack is too short."); + sz_cptr_t const h_end = h + h_length; -/** - * @brief Sorting algorithm, combining Radix Sort for the first 32 bits of every word - * and a follow-up by a more conventional sorting procedure on equally prefixed parts. - */ -SZ_PUBLIC void sz_sort(sz_sequence_t *sequence); +#if !SZ_USE_MISALIGNED_LOADS + // Process the misaligned head, to void UB on unaligned 64-bit loads. 
+ for (; ((sz_size_t)h & 7ull) && h + 2 <= h_end; ++h) + if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; +#endif -/** - * @brief Partial sorting algorithm, combining Radix Sort for the first 32 bits of every word - * and a follow-up by a more conventional sorting procedure on equally prefixed parts. - */ -SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t n); - -/** - * @brief Intro-Sort algorithm that supports custom comparators. - */ -SZ_PUBLIC void sz_sort_intro(sz_sequence_t *sequence, sz_sequence_comparator_t less); - -#pragma endregion - -/* - * Hardware feature detection. - * All of those can be controlled by the user. - */ -#ifndef SZ_USE_X86_AVX512 -#ifdef __AVX512BW__ -#define SZ_USE_X86_AVX512 1 -#else -#define SZ_USE_X86_AVX512 0 -#endif -#endif - -#ifndef SZ_USE_X86_AVX2 -#ifdef __AVX2__ -#define SZ_USE_X86_AVX2 1 -#else -#define SZ_USE_X86_AVX2 0 -#endif -#endif - -#ifndef SZ_USE_ARM_NEON -#ifdef __ARM_NEON -#define SZ_USE_ARM_NEON 1 -#else -#define SZ_USE_ARM_NEON 0 -#endif -#endif - -#ifndef SZ_USE_ARM_SVE -#ifdef __ARM_FEATURE_SVE -#define SZ_USE_ARM_SVE 1 -#else -#define SZ_USE_ARM_SVE 0 -#endif -#endif - -/* - * Include hardware-specific headers. - */ -#if SZ_USE_X86_AVX512 || SZ_USE_X86_AVX2 -#include -#endif // SZ_USE_X86... -#if SZ_USE_ARM_NEON -#if !defined(_MSC_VER) -#include -#endif -#include -#endif // SZ_USE_ARM_NEON -#if SZ_USE_ARM_SVE -#if !defined(_MSC_VER) -#include -#endif -#endif // SZ_USE_ARM_SVE - -#pragma region Hardware Specific API - -#if SZ_USE_X86_AVX512 - -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_edit_distance */ -SZ_PUBLIC sz_size_t sz_edit_distance_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); -/** @copydoc sz_alignment_score */ -SZ_PUBLIC sz_ssize_t sz_alignment_score_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); -/** @copydoc sz_hashes */ 
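The same broadcast-and-compare trick drives the single-byte and two-byte finders above. A self-contained sketch of the one-byte case, using the GCC/Clang `__builtin_ctzll` in place of `sz_u64_ctz`; `each_byte_equal` is an illustrative stand-in, not the library function:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

// Mark every byte of `word` that equals `needle` by setting that byte's top bit.
static uint64_t each_byte_equal(uint64_t word, unsigned char needle) {
    uint64_t broadcast = (uint64_t)needle * 0x0101010101010101ull; // copy the byte into all 8 lanes
    uint64_t x = ~(word ^ broadcast);                              // bytes equal to the needle become 0xFF
    // A byte is all-ones only if adding 1 to its low 7 bits carries into the top bit
    // and the top bit itself was already set.
    return ((x & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) & (x & 0x8080808080808080ull);
}

int main(void) {
    char const text[8] = {'s', 't', 'r', 'i', 'n', 'g', 'z', '!'};
    uint64_t word;
    memcpy(&word, text, sizeof(word)); // byte i of `text` lands in byte i of `word`
    uint64_t matches = each_byte_equal(word, 'i');
    if (matches) printf("%d\n", __builtin_ctzll(matches) / 8); // prints 3 on a little-endian machine
    return 0;
}

The reverse-order finder reuses the same mask and extracts the last match with a leading-zero count instead of a trailing-zero count.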
-SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle); -#endif - -#if SZ_USE_X86_AVX2 -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle); -#endif - -#if SZ_USE_ARM_NEON -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -#endif - -#if SZ_USE_ARM_SVE -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_sve(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_sve(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t 
length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_sve(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_sve(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_sve(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -#endif - -#pragma endregion - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wconversion" - -/* - ********************************************************************************************************************** - ********************************************************************************************************************** - ********************************************************************************************************************** - * - * This is where we the actual implementation begins. - * The rest of the file is hidden from the public API. - * - ********************************************************************************************************************** - ********************************************************************************************************************** - ********************************************************************************************************************** - */ - -#pragma region Compiler Extensions and Helper Functions - -#pragma GCC visibility push(hidden) - -/** - * @brief Helper-macro to mark potentially unused variables. - */ -#define sz_unused(x) ((void)(x)) - -/** - * @brief Helper-macro casting a variable to another type of the same size. - */ -#define sz_bitcast(type, value) (*((type *)&(value))) - -/** - * @brief Defines `SZ_NULL`, analogous to `NULL`. - * The default often comes from locale.h, stddef.h, - * stdio.h, stdlib.h, string.h, time.h, or wchar.h. - */ -#ifdef __GNUG__ -#define SZ_NULL __null -#define SZ_NULL_CHAR __null -#else -#define SZ_NULL ((void *)0) -#define SZ_NULL_CHAR ((char *)0) -#endif - -/** - * @brief Cache-line width, that will affect the execution of some algorithms, - * like equality checks and relative order computing. - */ -#define SZ_CACHE_LINE_WIDTH (64) // bytes - -/** - * @brief Similar to `assert`, the `sz_assert` is used in the SZ_DEBUG mode - * to check the invariants of the library. It's a no-op in the SZ_RELEASE mode. 
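A hypothetical stand-alone macro in the same spirit as the `sz_assert` described above: report the failing condition with its file and line, then exit. `MY_ASSERT` is an illustrative name, not the library macro:

#include <stdio.h>
#include <stdlib.h>

#define MY_ASSERT(condition)                                                 \
    do {                                                                     \
        if (!(condition)) {                                                  \
            fprintf(stderr, "Assertion failed: %s, in file %s, line %d\n",   \
                    #condition, __FILE__, __LINE__);                         \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

int main(void) {
    MY_ASSERT(1 + 1 == 2); // passes silently
    return 0;
}

The `do { ... } while (0)` wrapper keeps the macro usable as a single statement, for example inside an unbraced `if`/`else`.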
- * @note If you want to catch it, put a breakpoint at @b `__GI_exit` - */ -#if SZ_DEBUG && defined(SZ_AVOID_LIBC) && !SZ_AVOID_LIBC && !defined(SZ_PIC) -#include // `fprintf` -#include // `EXIT_FAILURE` -SZ_PUBLIC void _sz_assert_failure(char const *condition, char const *file, int line) { - fprintf(stderr, "Assertion failed: %s, in file %s, line %d\n", condition, file, line); - exit(EXIT_FAILURE); -} -#define sz_assert(condition) \ - do { \ - if (!(condition)) { _sz_assert_failure(#condition, __FILE__, __LINE__); } \ - } while (0) -#else -#define sz_assert(condition) ((void)(condition)) -#endif - -/* Intrinsics aliases for MSVC, GCC, Clang, and Clang-Cl. - * The following section of compiler intrinsics comes in 2 flavors. - */ -#if defined(_MSC_VER) && !defined(__clang__) // On Clang-CL -#include - -// Sadly, when building Win32 images, we can't use the `_tzcnt_u64`, `_lzcnt_u64`, -// `_BitScanForward64`, or `_BitScanReverse64` intrinsics. For now it's a simple `for`-loop. -// TODO: In the future we can switch to a more efficient De Bruijn's algorithm. -// https://www.chessprogramming.org/BitScan -// https://www.chessprogramming.org/De_Bruijn_Sequence -// https://gist.github.com/resilar/e722d4600dbec9752771ab4c9d47044f -// -// Use the serial version on 32-bit x86 and on Arm. -#if (defined(_WIN32) && !defined(_WIN64)) || defined(_M_ARM) || defined(_M_ARM64) -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 1) == 0) { n++, x >>= 1; } - return n; -} -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 0x8000000000000000ull) == 0) { n++, x <<= 1; } - return n; -} -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { - x = x - ((x >> 1) & 0x5555555555555555ull); - x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull); - return (((x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Full) * 0x0101010101010101ull) >> 56; -} -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 1) == 0) { n++, x >>= 1; } - return n; -} -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 0x80000000u) == 0) { n++, x <<= 1; } - return n; -} -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { - x = x - ((x >> 1) & 0x55555555); - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - return (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; -} -#else -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return (int)_tzcnt_u64(x); } -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return (int)_lzcnt_u64(x); } -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return (int)__popcnt64(x); } -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return (int)_tzcnt_u32(x); } -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return (int)_lzcnt_u32(x); } -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return (int)__popcnt(x); } -#endif -// Force the byteswap functions to be intrinsics, because when /Oi- is given, these will turn into CRT function calls, -// which breaks when `SZ_AVOID_LIBC` is given -#pragma intrinsic(_byteswap_uint64) -SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return _byteswap_uint64(val); } -#pragma intrinsic(_byteswap_ulong) -SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return _byteswap_ulong(val); } -#else -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return __builtin_popcountll(x); } -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return __builtin_popcount(x); } -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return __builtin_ctzll(x); } -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return 
__builtin_clzll(x); } -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return __builtin_ctz(x); } // ! Undefined if `x == 0` -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return __builtin_clz(x); } // ! Undefined if `x == 0` -SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return __builtin_bswap64(val); } -SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return __builtin_bswap32(val); } -#endif - -SZ_INTERNAL sz_u64_t sz_u64_rotl(sz_u64_t x, sz_u64_t r) { return (x << r) | (x >> (64 - r)); } - -/** - * @brief Select bits from either ::a or ::b depending on the value of ::mask bits. - * - * Similar to `_mm_blend_epi16` intrinsic on x86. - * Described in the "Bit Twiddling Hacks" by Sean Eron Anderson. - * https://graphics.stanford.edu/~seander/bithacks.html#ConditionalSetOrClearBitsWithoutBranching - */ -SZ_INTERNAL sz_u64_t sz_u64_blend(sz_u64_t a, sz_u64_t b, sz_u64_t mask) { return a ^ ((a ^ b) & mask); } - -/* - * Efficiently computing the minimum and maximum of two or three values can be tricky. - * The simple branching baseline would be: - * - * x < y ? x : y // can replace with 1 conditional move - * - * Branchless approach is well known for signed integers, but it doesn't apply to unsigned ones. - * https://stackoverflow.com/questions/514435/templatized-branchless-int-max-min-function - * https://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax - * Using only bit-shifts for singed integers it would be: - * - * y + ((x - y) & (x - y) >> 31) // 4 unique operations - * - * Alternatively, for any integers using multiplication: - * - * (x > y) * y + (x <= y) * x // 5 operations - * - * Alternatively, to avoid multiplication: - * - * x & ~((x < y) - 1) + y & ((x < y) - 1) // 6 unique operations - */ -#define sz_min_of_two(x, y) (x < y ? x : y) -#define sz_max_of_two(x, y) (x < y ? y : x) -#define sz_min_of_three(x, y, z) sz_min_of_two(x, sz_min_of_two(y, z)) -#define sz_max_of_three(x, y, z) sz_max_of_two(x, sz_max_of_two(y, z)) - -/** @brief Branchless minimum function for two signed 32-bit integers. */ -SZ_INTERNAL sz_i32_t sz_i32_min_of_two(sz_i32_t x, sz_i32_t y) { return y + ((x - y) & (x - y) >> 31); } - -/** @brief Branchless minimum function for two signed 32-bit integers. */ -SZ_INTERNAL sz_i32_t sz_i32_max_of_two(sz_i32_t x, sz_i32_t y) { return x - ((x - y) & (x - y) >> 31); } - -/** - * @brief Clamps signed offsets in a string to a valid range. Used for Pythonic-style slicing. - */ -SZ_INTERNAL void sz_ssize_clamp_interval(sz_size_t length, sz_ssize_t start, sz_ssize_t end, - sz_size_t *normalized_offset, sz_size_t *normalized_length) { - // TODO: Remove branches. - // Normalize negative indices - if (start < 0) start += length; - if (end < 0) end += length; - - // Clamp indices to a valid range - if (start < 0) start = 0; - if (end < 0) end = 0; - if (start > (sz_ssize_t)length) start = length; - if (end > (sz_ssize_t)length) end = length; - - // Ensure start <= end - if (start > end) start = end; - - *normalized_offset = start; - *normalized_length = end - start; -} - -/** - * @brief Compute the logarithm base 2 of a positive integer, rounding down. - */ -SZ_INTERNAL sz_size_t sz_size_log2i_nonzero(sz_size_t x) { - sz_assert(x > 0 && "Non-positive numbers have no defined logarithm"); - sz_size_t leading_zeros = sz_u64_clz(x); - return 63 - leading_zeros; -} - -/** - * @brief Compute the smallest power of two greater than or equal to ::x. 
- */ -SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) { - // Unlike the commonly used trick with `clz` intrinsics, is valid across the whole range of `x`. - // https://stackoverflow.com/a/10143264 - x--; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; -#if SZ_DETECT_64_BIT - x |= x >> 32; -#endif - x++; - return x; -} - -/** - * @brief Transposes an 8x8 bit matrix packed in a `sz_u64_t`. - * - * There is a well known SWAR sequence for that known to chess programmers, - * willing to flip a bit-matrix of pieces along the main A1-H8 diagonal. - * https://www.chessprogramming.org/Flipping_Mirroring_and_Rotating - * https://lukas-prokop.at/articles/2021-07-23-transpose - */ -SZ_INTERNAL sz_u64_t sz_u64_transpose(sz_u64_t x) { - sz_u64_t t; - t = x ^ (x << 36); - x ^= 0xf0f0f0f00f0f0f0full & (t ^ (x >> 36)); - t = 0xcccc0000cccc0000ull & (x ^ (x << 18)); - x ^= t ^ (t >> 18); - t = 0xaa00aa00aa00aa00ull & (x ^ (x << 9)); - x ^= t ^ (t >> 9); - return x; -} - -/** - * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. - */ -SZ_INTERNAL void sz_u64_swap(sz_u64_t *a, sz_u64_t *b) { - sz_u64_t t = *a; - *a = *b; - *b = t; -} - -/** - * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. - */ -SZ_INTERNAL void sz_pointer_swap(void **a, void **b) { - void *t = *a; - *a = *b; - *b = t; -} - -/** - * @brief Helper structure to simplify work with 16-bit words. - * @see sz_u16_load - */ -typedef union sz_u16_vec_t { - sz_u16_t u16; - sz_u8_t u8s[2]; -} sz_u16_vec_t; - -/** - * @brief Load a 16-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u16_vec_t sz_u16_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u16_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u16_vec_t *)ptr); -#else - return *((__unaligned sz_u16_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u16_vec_t const *result = (sz_u16_vec_t const *)ptr; - return *result; -#endif -} - -/** - * @brief Helper structure to simplify work with 32-bit words. - * @see sz_u32_load - */ -typedef union sz_u32_vec_t { - sz_u32_t u32; - sz_u16_t u16s[2]; - sz_u8_t u8s[4]; -} sz_u32_vec_t; - -/** - * @brief Load a 32-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u32_vec_t sz_u32_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u32_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - result.u8s[2] = ptr[2]; - result.u8s[3] = ptr[3]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u32_vec_t *)ptr); -#else - return *((__unaligned sz_u32_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u32_vec_t const *result = (sz_u32_vec_t const *)ptr; - return *result; -#endif -} - -/** - * @brief Helper structure to simplify work with 64-bit words. - * @see sz_u64_load - */ -typedef union sz_u64_vec_t { - sz_u64_t u64; - sz_u32_t u32s[2]; - sz_u16_t u16s[4]; - sz_u8_t u8s[8]; -} sz_u64_vec_t; - -/** - * @brief Load a 64-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. 
- */ -SZ_INTERNAL sz_u64_vec_t sz_u64_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u64_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - result.u8s[2] = ptr[2]; - result.u8s[3] = ptr[3]; - result.u8s[4] = ptr[4]; - result.u8s[5] = ptr[5]; - result.u8s[6] = ptr[6]; - result.u8s[7] = ptr[7]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u64_vec_t *)ptr); -#else - return *((__unaligned sz_u64_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u64_vec_t const *result = (sz_u64_vec_t const *)ptr; - return *result; -#endif -} - -/** @brief Helper function, using the supplied fixed-capacity buffer to allocate memory. */ -SZ_INTERNAL sz_ptr_t _sz_memory_allocate_fixed(sz_size_t length, void *handle) { - sz_size_t capacity; - sz_copy((sz_ptr_t)&capacity, (sz_cptr_t)handle, sizeof(sz_size_t)); - sz_size_t consumed_capacity = sizeof(sz_size_t); - if (consumed_capacity + length > capacity) return SZ_NULL_CHAR; - return (sz_ptr_t)handle + consumed_capacity; -} - -/** @brief Helper "no-op" function, simulating memory deallocation when we use a "static" memory buffer. */ -SZ_INTERNAL void _sz_memory_free_fixed(sz_ptr_t start, sz_size_t length, void *handle) { - sz_unused(start && length && handle); -} - -/** @brief An internal callback used to set a bit in a power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) & (fingerprint_bytes - 1)] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback used to set a bit in a @b non power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_non_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) % fingerprint_bytes] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback, used to mix all the running hashes into one pointer-size value. */ -SZ_INTERNAL void _sz_hashes_fingerprint_scalar_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *scalar_handle) { - sz_unused(start && length && hash && scalar_handle); - sz_size_t *scalar_ptr = (sz_size_t *)scalar_handle; - *scalar_ptr ^= hash; -} - -/** - * @brief Chooses the offsets of the most interesting characters in a search needle. - * - * Search throughput can significantly deteriorate if we are matching the wrong characters. - * Say the needle is "aXaYa", and we are comparing the first, second, and last character. - * If we use SIMD and compare many offsets at a time, comparing against "a" in every register is a waste. - * - * Similarly, dealing with UTF8 inputs, we know that the lower bits of each character code carry more information. - * Cyrillic alphabet, for example, falls into [0x0410, 0x042F] code range for uppercase [А, Я], and - * into [0x0430, 0x044F] for lowercase [а, я]. 
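The uneven information content of UTF-8 bytes noted above is easy to see on real data. A small sketch over the bytes of the Russian word «Привет» (hard-coded so the example does not depend on source-file encoding), flagging which bytes fall into the <= 191 range the heuristic prefers:

#include <stdio.h>

int main(void) {
    // UTF-8 bytes of "Привет": each two-byte rune starts with a nearly constant
    // lead byte (0xD0 or 0xD1), while the continuation bytes vary.
    unsigned char const privet[] = {0xD0, 0x9F, 0xD1, 0x80, 0xD0, 0xB8, 0xD0, 0xB2, 0xD0, 0xB5, 0xD1, 0x82};
    for (unsigned i = 0; i != sizeof(privet); ++i)
        printf("byte %2u: 0x%02X %s\n", i, (unsigned)privet[i],
               privet[i] <= 191 ? "(informative)" : "(repetitive lead)");
    return 0;
}

Anchoring the search on the varying continuation bytes, rather than on the repeated lead bytes, is exactly what the <= 191 filter achieves.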
Scanning through a text written in Russian, half of the - * bytes will carry absolutely no value and will be equal to 0x04. - */ -SZ_INTERNAL void _sz_locate_needle_anomalies(sz_cptr_t start, sz_size_t length, // - sz_size_t *first, sz_size_t *second, sz_size_t *third) { - *first = 0; - *second = length / 2; - *third = length - 1; - - // - int has_duplicates = // - start[*first] == start[*second] || // - start[*first] == start[*third] || // - start[*second] == start[*third]; - - // Loop through letters to find non-colliding variants. - if (length > 3 && has_duplicates) { - // Pivot the middle point right, until we find a character different from the first one. - for (; start[*second] == start[*first] && *second + 1 < *third; ++(*second)) {} - // Pivot the third (last) point left, until we find a different character. - for (; (start[*third] == start[*second] || start[*third] == start[*first]) && *third > (*second + 1); - --(*third)) {} - } - - // TODO: Investigate alternative strategies for long needles. - // On very long needles we have the luxury to choose! - // Often dealing with UTF8, we will likely benefit from shifting the first and second characters - // further to the right, to achieve not only uniqueness within the needle, but also avoid common - // rune prefixes of 2-, 3-, and 4-byte codes. - if (length > 8) { - // Pivot the first and second points right, until we find a character, that: - // > is different from others. - // > doesn't start with 0b'110x'xxxx - only 5 bits of relevant info. - // > doesn't start with 0b'1110'xxxx - only 4 bits of relevant info. - // > doesn't start with 0b'1111'0xxx - only 3 bits of relevant info. - // - // So we are practically searching for byte values that start with 0b0xxx'xxxx or 0b'10xx'xxxx. - // Meaning they fall in the range [0, 127] and [128, 191], in other words any unsigned int up to 191. - sz_u8_t const *start_u8 = (sz_u8_t const *)start; - sz_size_t vibrant_first = *first, vibrant_second = *second, vibrant_third = *third; - - // Let's begin with the seccond character, as the termination criteria there is more obvious - // and we may end up with more variants to check for the first candidate. - for (; (start_u8[vibrant_second] > 191 || start_u8[vibrant_second] == start_u8[vibrant_third]) && - (vibrant_second + 1 < vibrant_third); - ++vibrant_second) {} - - // Now check if we've indeed found a good candidate or should revert the `vibrant_second` to `second`. - if (start_u8[vibrant_second] < 191) { *second = vibrant_second; } - else { vibrant_second = *second; } - - // Now check the first character. - for (; (start_u8[vibrant_first] > 191 || start_u8[vibrant_first] == start_u8[vibrant_second] || - start_u8[vibrant_first] == start_u8[vibrant_third]) && - (vibrant_first + 1 < vibrant_second); - ++vibrant_first) {} - - // Now check if we've indeed found a good candidate or should revert the `vibrant_first` to `first`. - // We don't need to shift the third one when dealing with texts as the last byte of the text is - // also the last byte of a rune and contains the most information. 
- if (start_u8[vibrant_first] < 191) { *first = vibrant_first; } - } -} - -#pragma GCC visibility pop -#pragma endregion - -#pragma region Serial Implementation - -#if !SZ_AVOID_LIBC -#include // `fprintf` -#include // `malloc`, `EXIT_FAILURE` - -SZ_PUBLIC void *_sz_memory_allocate_default(sz_size_t length, void *handle) { - sz_unused(handle); - return malloc(length); -} -SZ_PUBLIC void _sz_memory_free_default(sz_ptr_t start, sz_size_t length, void *handle) { - sz_unused(handle && length); - free(start); -} - -#endif - -SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc) { -#if !SZ_AVOID_LIBC - alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_default; - alloc->free = (sz_memory_free_t)_sz_memory_free_default; -#else - alloc->allocate = (sz_memory_allocate_t)SZ_NULL; - alloc->free = (sz_memory_free_t)SZ_NULL; -#endif - alloc->handle = SZ_NULL; -} - -SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length) { - // The logic here is simple - put the buffer length in the first slots of the buffer. - // Later use it for bounds checking. - alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_fixed; - alloc->free = (sz_memory_free_t)_sz_memory_free_fixed; - alloc->handle = &buffer; - sz_copy((sz_ptr_t)buffer, (sz_cptr_t)&length, sizeof(sz_size_t)); -} - -/** - * @brief Byte-level equality comparison between two strings. - * If unaligned loads are allowed, uses a switch-table to avoid loops on short strings. - */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_cptr_t const a_end = a + length; -#if SZ_USE_MISALIGNED_LOADS - if (length >= SZ_SWAR_THRESHOLD) { - sz_u64_vec_t a_vec, b_vec; - for (; a + 8 <= a_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) return sz_false_k; - } - } -#endif - while (a != a_end && *a == *b) a++, b++; - return (sz_bool_t)(a_end == a); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { - for (sz_cptr_t const end = text + length; text != end; ++text) - if (sz_charset_contains(set, *text)) return text; - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Warray-bounds" - sz_cptr_t const end = text; - for (text += length; text != end;) - if (sz_charset_contains(set, *(text -= 1))) return text; - return SZ_NULL_CHAR; -#pragma GCC diagnostic pop -} - -/** - * One option to avoid branching is to use conditional moves and lookup the comparison result in a table: - * sz_ordering_t ordering_lookup[2] = {sz_greater_k, sz_less_k}; - * for (; a != min_end; ++a, ++b) - * if (*a != *b) return ordering_lookup[*a < *b]; - * That, however, introduces a data-dependency. - * A cleaner option is to perform two comparisons and a subtraction. - * One instruction more, but no data-dependency. - */ -#define _sz_order_scalars(a, b) ((sz_ordering_t)((a > b) - (a < b))) - -SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - sz_bool_t a_shorter = (sz_bool_t)(a_length < b_length); - sz_size_t min_length = a_shorter ? 
a_length : b_length; - sz_cptr_t min_end = a + min_length; -#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN - for (sz_u64_vec_t a_vec, b_vec; a + 8 <= min_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) - return _sz_order_scalars(sz_u64_bytes_reverse(a_vec.u64), sz_u64_bytes_reverse(b_vec.u64)); - } -#endif - for (; a != min_end; ++a, ++b) - if (*a != *b) return _sz_order_scalars(*a, *b); - - // If the strings are equal up to `min_end`, then the shorter string is smaller - return _sz_order_scalars(a_length, b_length); -} - -/** - * @brief Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each byte is set. - // For that take the bottom 7 bits of each byte, add one to them, - // and if this sets the top bit to one, then all the 7 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) & ((vec.u64 & 0x8080808080808080ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memchr(haystack, needle[0], haystack_length)`. - */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_end = h + h_length; - -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. -#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. - sz_u64_vec_t h_vec, n_vec, match_vec; - match_vec.u64 = 0; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h + 8 <= h_end; h += 8) { - h_vec.u64 = *(sz_u64_t const *)h; - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h + sz_u64_ctz(match_vec.u64) / 8; - } -#endif - - // Handle the misaligned tail. - for (; h < h_end; ++h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief Find the last occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memrchr(haystack, needle[0], haystack_length)`. - */ -sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_start = h; - - // Reposition the `h` pointer to the end, as we will be walking backwards. - h = h + h_length - 1; - -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. -#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)(h + 1) & 7ull) && h >= h_start; --h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. 
- sz_u64_vec_t h_vec, n_vec, match_vec; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h >= h_start + 7; h -= 8) { - h_vec.u64 = *(sz_u64_t const *)(h - 7); - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h - sz_u64_clz(match_vec.u64) / 8; - } -#endif - - for (; h >= h_start; --h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 2Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 2byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_2byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 2byte is set. - // For that take the bottom 15 bits of each 2byte, add one to them, - // and if this sets the top bit to one, then all the 15 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFF7FFF7FFF7FFFull) + 0x0001000100010001ull) & ((vec.u64 & 0x8000800080008000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b two-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_2byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 2 bytes long. - sz_assert(h_length >= 2 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; -#endif - - sz_u64_vec_t h_even_vec, h_odd_vec, n_vec, matches_even_vec, matches_odd_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1]; - n_vec.u64 *= 0x0001000100010001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. - for (; h + 9 <= h_end; h += 8) { - h_even_vec.u64 = *(sz_u64_t *)h; - h_odd_vec.u64 = (h_even_vec.u64 >> 8) | ((sz_u64_t)h[8] << 56); - matches_even_vec = _sz_u64_each_2byte_equal(h_even_vec, n_vec); - matches_odd_vec = _sz_u64_each_2byte_equal(h_odd_vec, n_vec); - - matches_even_vec.u64 >>= 8; - if (matches_even_vec.u64 + matches_odd_vec.u64) { - sz_u64_t match_indicators = matches_even_vec.u64 | matches_odd_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 4Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 4byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_4byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 4byte is set. - // For that take the bottom 31 bits of each 4byte, add one to them, - // and if this sets the top bit to one, then all the 31 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFFFFFF7FFFFFFFull) + 0x0000000100000001ull) & ((vec.u64 & 0x8000000080000000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b four-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. 
- */ -SZ_INTERNAL sz_cptr_t _sz_find_4byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 4 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 4 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; -#endif - - sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, n_vec, matches0_vec, matches1_vec, matches2_vec, matches3_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2], n_vec.u8s[3] = n[3]; - n_vec.u64 *= 0x0000000100000001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using four 64-bit words. - // We load the subsequent four-byte word as well, taking its first bytes. Think of it as a glorified prefetch :) - sz_u64_t h_page_current, h_page_next; - for (; h + sizeof(sz_u64_t) + sizeof(sz_u32_t) <= h_end; h += sizeof(sz_u64_t)) { - h_page_current = *(sz_u64_t *)h; - h_page_next = *(sz_u32_t *)(h + 8); - h0_vec.u64 = (h_page_current); - h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); - h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); - h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); - matches0_vec = _sz_u64_each_4byte_equal(h0_vec, n_vec); - matches1_vec = _sz_u64_each_4byte_equal(h1_vec, n_vec); - matches2_vec = _sz_u64_each_4byte_equal(h2_vec, n_vec); - matches3_vec = _sz_u64_each_4byte_equal(h3_vec, n_vec); - - if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64) { - matches0_vec.u64 >>= 24; - matches1_vec.u64 >>= 16; - matches2_vec.u64 >>= 8; - sz_u64_t match_indicators = matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 4 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 3Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 3byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_3byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 4byte is set. - // For that take the bottom 31 bits of each 4byte, add one to them, - // and if this sets the top bit to one, then all the 31 bits are ones as well. - vec.u64 = ((vec.u64 & 0xFFFF7FFFFF7FFFFFull) + 0x0000000001000001ull) & ((vec.u64 & 0x0000800000800000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b three-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_3byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 3 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. 
-    for (; ((sz_size_t)h & 7ull) && h + 3 <= h_end; ++h)
-        if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h;
-#endif
-
-    // We fetch 10 bytes per iteration: an 8-byte word and the following 2-byte word.
-    sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, h4_vec;
-    sz_u64_vec_t matches0_vec, matches1_vec, matches2_vec, matches3_vec, matches4_vec;
-    sz_u64_vec_t n_vec;
-    n_vec.u64 = 0;
-    n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2];
-    n_vec.u64 *= 0x0000000001000001ull; // broadcast
-
-    // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using five 64-bit words.
-    // We load the subsequent two-byte word as well.
-    sz_u64_t h_page_current, h_page_next;
-    for (; h + sizeof(sz_u64_t) + sizeof(sz_u16_t) <= h_end; h += sizeof(sz_u64_t)) {
-        h_page_current = *(sz_u64_t *)h;
-        h_page_next = *(sz_u16_t *)(h + 8);
-        h0_vec.u64 = (h_page_current);
-        h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56);
-        h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48);
-        h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40);
-        h4_vec.u64 = (h_page_current >> 32) | (h_page_next << 32);
-        matches0_vec = _sz_u64_each_3byte_equal(h0_vec, n_vec);
-        matches1_vec = _sz_u64_each_3byte_equal(h1_vec, n_vec);
-        matches2_vec = _sz_u64_each_3byte_equal(h2_vec, n_vec);
-        matches3_vec = _sz_u64_each_3byte_equal(h3_vec, n_vec);
-        matches4_vec = _sz_u64_each_3byte_equal(h4_vec, n_vec);
-
-        if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64) {
-            matches0_vec.u64 >>= 16;
-            matches1_vec.u64 >>= 8;
-            matches3_vec.u64 <<= 8;
-            matches4_vec.u64 <<= 16;
-            sz_u64_t match_indicators =
-                matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64;
-            return h + sz_u64_ctz(match_indicators) / 8;
-        }
-    }
-
-    for (; h + 3 <= h_end; ++h)
-        if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h;
-    return SZ_NULL_CHAR;
-}
-
-/**
- * @brief Boyer-Moore-Horspool algorithm for exact matching of patterns up to @b 256-bytes long.
- * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern.
- */
-SZ_INTERNAL sz_cptr_t _sz_find_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, //
-                                                             sz_cptr_t n_chars, sz_size_t n_length) {
-    sz_assert(n_length <= 256 && "The pattern is too long.");
-    // Several popular string matching algorithms use a bad-character shift table.
-    // Boyer Moore: https://www-igm.univ-mlv.fr/~lecroq/string/node14.html
-    // Quick Search: https://www-igm.univ-mlv.fr/~lecroq/string/node19.html
-    // Smith: https://www-igm.univ-mlv.fr/~lecroq/string/node21.html
-    union {
-        sz_u8_t jumps[256];
-        sz_u64_vec_t vecs[64];
-    } bad_shift_table;
-
-    // Let's initialize the table using SWAR to the total length of the needle.
-    sz_u8_t const *h = (sz_u8_t const *)h_chars;
-    sz_u8_t const *n = (sz_u8_t const *)n_chars;
-    {
-        sz_u64_vec_t n_length_vec;
-        n_length_vec.u64 = n_length;
-        n_length_vec.u64 *= 0x0101010101010101ull; // broadcast
-        for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64;
-        for (sz_size_t i = 0; i + 1 < n_length; ++i) bad_shift_table.jumps[n[i]] = (sz_u8_t)(n_length - i - 1);
-    }
-
-    // Another common heuristic is to match a few characters from different parts of a string.
-    // Raita suggests using the first two, the last, and the middle character of the pattern.
-    sz_u32_vec_t h_vec, n_vec;
-
-    // Pick the parts of the needle that are worth comparing.
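/* For intuition, with a hypothetical needle "hello" (5 bytes) the table above defaults every byte to a
 * shift of 5 and then records jumps['h'] = 4, jumps['e'] = 3, jumps['l'] = 1, while the last character
 * 'o' keeps the full shift, exactly as in the classical Horspool bad-character rule. The four probe
 * bytes gathered just below are packed into one 32-bit word, so most candidate windows are rejected
 * with a single integer comparison before the full `sz_equal` call. */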
- sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the last `n_length - 1` bytes. - for (sz_size_t i = 0; i <= h_length - n_length;) { - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - i += bad_shift_table.jumps[h[i + n_length - 1]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for @b reverse-order exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) - bad_shift_table.jumps[n[n_length - i - 1]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the first `n_length - 1` bytes. - for (sz_size_t j = 0; j <= h_length - n_length;) { - sz_size_t i = h_length - n_length - j; - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - j += bad_shift_table.jumps[h[i]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Exact substring search helper function, that finds the first occurrence of a prefix of the needle - * using a given search function, and then verifies the remaining part of the needle. 
- */ -SZ_INTERNAL sz_cptr_t _sz_find_with_prefix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_prefix, sz_size_t prefix_length) { - - sz_size_t suffix_length = n_length - prefix_length; - while (1) { - sz_cptr_t found = find_prefix(h, h_length, n, prefix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = h_length - (found - h); - if (remaining < n_length) return SZ_NULL_CHAR; - if (sz_equal(found + prefix_length, n + prefix_length, suffix_length)) return found; - - // Adjust the position. - h = found + 1; - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -/** - * @brief Exact reverse-order substring search helper function, that finds the last occurrence of a suffix of the - * needle using a given search function, and then verifies the remaining part of the needle. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_with_suffix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_suffix, sz_size_t suffix_length) { - - sz_size_t prefix_length = n_length - suffix_length; - while (1) { - sz_cptr_t found = find_suffix(h, h_length, n + prefix_length, suffix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = found - h; - if (remaining < prefix_length) return SZ_NULL_CHAR; - if (sz_equal(found - prefix_length, n, prefix_length)) return found - prefix_length; - - // Adjust the position. - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -SZ_INTERNAL sz_cptr_t _sz_find_over_4bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, (sz_find_t)_sz_find_4byte_serial, 4); -} - -SZ_INTERNAL sz_cptr_t _sz_find_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, - sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, _sz_find_horspool_upto_256bytes_serial, 256); -} - -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, - sz_size_t n_length) { - return _sz_rfind_with_suffix(h, h_length, n, n_length, _sz_rfind_horspool_upto_256bytes_serial, 256); -} - -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - -#if SZ_DETECT_BIG_ENDIAN - sz_find_t backends[] = { - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[(n_length > 1) + (n_length > 256)](h, h_length, n, n_length); -#else - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_2byte_serial, - (sz_find_t)_sz_find_3byte_serial, - (sz_find_t)_sz_find_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - (sz_find_t)_sz_find_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. 
- (n_length > 1) + (n_length > 2) + (n_length > 3) + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 4) + - // For longer needles - use skip tables. - (n_length > 8) + (n_length > 256)](h, h_length, n, n_length); -#endif -} - -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. - (sz_find_t)sz_rfind_byte_serial, - // TODO: implement reverse-order SWAR for 2/3/4 byte variants. - // TODO: (sz_find_t)_sz_rfind_2byte_serial, - // TODO: (sz_find_t)_sz_rfind_3byte_serial, - // TODO: (sz_find_t)_sz_rfind_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - // (sz_find_t)_sz_rfind_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_rfind_horspool_upto_256bytes_serial, - (sz_find_t)_sz_rfind_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. - 0 + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 1) + - // For longer needles - use skip tables. - (n_length > 256)](h, h_length, n, n_length); -} - -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // TODO: Generalize to remove the following asserts! - sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix."); - sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet."); - sz_unused(longer_length && bound); - - // We are going to store 3 diagonals of the matrix. - // The length of the longest (main) diagonal would be `n = (shorter_length + 1)`. - sz_size_t n = shorter_length + 1; - sz_size_t buffer_length = sizeof(sz_size_t) * n * 3; - sz_size_t *distances = (sz_size_t *)alloc->allocate(buffer_length, alloc->handle); - if (!distances) return SZ_SIZE_MAX; - - sz_size_t *previous_distances = distances; - sz_size_t *current_distances = previous_distances + n; - sz_size_t *next_distances = previous_distances + n * 2; - - // Initialize the first two diagonals: - previous_distances[0] = 0; - current_distances[0] = current_distances[1] = 1; - - // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_diagonal_index = 2; - for (; next_diagonal_index != n; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = next_diagonal_index + 1; - for (sz_size_t i = 0; i + 2 < next_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[next_diagonal_index - i - 2] != longer[i]; - sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; - sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; - next_distances[i + 1] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); - } - // Don't forget to populate the first row and the first column of the Levenshtein matrix. 
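/* The first and the last element of every new diagonal lie in the first row and the first column of the
 * classical Levenshtein matrix, where the distance is just the number of insertions or deletions, i.e.
 * the diagonal index itself. The diagonal recurrence above computes the same quantity as the textbook
 * definition; a tiny exponential reference sketch (hypothetical, usable only to cross-check very short
 * inputs, not part of the library) would look like this: */
static sz_size_t _sz_edit_distance_reference(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) {
    if (!a_length) return b_length;
    if (!b_length) return a_length;
    // Either substitute (or keep) the leading characters, or delete from one of the strings.
    sz_size_t if_substitution = _sz_edit_distance_reference(a + 1, a_length - 1, b + 1, b_length - 1) + (a[0] != b[0]);
    sz_size_t if_deletion = _sz_edit_distance_reference(a + 1, a_length - 1, b, b_length) + 1;
    sz_size_t if_insertion = _sz_edit_distance_reference(a, a_length, b + 1, b_length - 1) + 1;
    return sz_min_of_two(if_substitution, sz_min_of_two(if_deletion, if_insertion));
}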
- next_distances[0] = next_distances[next_diagonal_length - 1] = next_diagonal_index; - // Perform a circular rotation of those buffers, to reuse the memory. - sz_size_t *temporary = previous_distances; - previous_distances = current_distances; - current_distances = next_distances; - next_distances = temporary; - } - - // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a - // larger diagonal. From now onwards, we will be shrinking. Instead of adding value equal to the skewed diagonal - // index on either side, we will be cropping those values out. - sz_size_t diagonals_count = n + n - 1; - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; - for (sz_size_t i = 0; i != next_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[shorter_length - 1 - i] != longer[next_diagonal_index - n + i]; - sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; - sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; - next_distances[i] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); - } - // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift, - // dropping the first element in the current array. - sz_size_t *temporary = previous_distances; - previous_distances = current_distances + 1; - current_distances = next_distances; - next_distances = temporary; - } - - // Cache scalar before `free` call. - sz_size_t result = current_distances[0]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -/** - * @brief Describes the length of a UTF8 character / codepoint / rune in bytes. - */ -typedef enum { - sz_utf8_invalid_k = 0, //!< Invalid UTF8 character. - sz_utf8_rune_1byte_k = 1, //!< 1-byte UTF8 character. - sz_utf8_rune_2bytes_k = 2, //!< 2-byte UTF8 character. - sz_utf8_rune_3bytes_k = 3, //!< 3-byte UTF8 character. - sz_utf8_rune_4bytes_k = 4, //!< 4-byte UTF8 character. -} sz_rune_length_t; - -typedef sz_u32_t sz_rune_t; - -/** - * @brief Extracts just one UTF8 codepoint from a UTF8 string into a 32-bit unsigned integer. - */ -SZ_INTERNAL void _sz_extract_utf8_rune(sz_cptr_t utf8, sz_rune_t *code, sz_rune_length_t *code_length) { - sz_u8_t const *current = (sz_u8_t const *)utf8; - sz_u8_t leading_byte = *current++; - sz_rune_t ch; - sz_rune_length_t ch_length; - - // TODO: This can be made entirely branchless using 32-bit SWAR. - if (leading_byte < 0x80) { - // Single-byte rune (0xxxxxxx) - ch = leading_byte; - ch_length = sz_utf8_rune_1byte_k; - } - else if ((leading_byte & 0xE0) == 0xC0) { - // Two-byte rune (110xxxxx 10xxxxxx) - ch = (leading_byte & 0x1F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_2bytes_k; - } - else if ((leading_byte & 0xF0) == 0xE0) { - // Three-byte rune (1110xxxx 10xxxxxx 10xxxxxx) - ch = (leading_byte & 0x0F) << 12; - ch |= (*current++ & 0x3F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_3bytes_k; - } - else if ((leading_byte & 0xF8) == 0xF0) { - // Four-byte rune (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) - ch = (leading_byte & 0x07) << 18; - ch |= (*current++ & 0x3F) << 12; - ch |= (*current++ & 0x3F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_4bytes_k; - } - else { - // Invalid UTF8 rune. 
-        ch = 0;
-        ch_length = sz_utf8_invalid_k;
-    }
-    *code = ch;
-    *code_length = ch_length;
-}
-
-/**
- * @brief Exports a UTF8 string into a UTF32 buffer.
- * ! The result is undefined if the UTF8 string is corrupted.
- * @return The length in the number of codepoints.
- */
-SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_length, sz_rune_t *utf32) {
-    sz_cptr_t const end = utf8 + utf8_length;
-    sz_size_t count = 0;
-    sz_rune_length_t rune_length;
-    for (; utf8 != end; utf8 += rune_length, utf32++, count++) _sz_extract_utf8_rune(utf8, utf32, &rune_length);
-    return count;
-}
-
-/**
- * @brief Compute the Levenshtein distance between two strings using the Wagner-Fisher algorithm.
- * Stores only 2 rows of the Levenshtein matrix, but uses 64-bit integers for the distance values,
- * and upcasts UTF8 variable-length codepoints to 32-bit integers for faster addressing.
- *
- * ! In the worst case, for 2 strings of length 100 that contain just one non-ASCII codepoint, this will result in extra:
- * + 2 rows * 100 slots * 8 bytes/slot = 1600 bytes of memory for the two rows of the Levenshtein matrix.
- * + 100 codepoints * 2 strings * 4 bytes/codepoint = 800 bytes of memory for the UTF32 buffers.
- * = 2400 bytes of memory or @b 12x memory amplification!
- */
-SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( //
-    sz_cptr_t longer, sz_size_t longer_length,                //
-    sz_cptr_t shorter, sz_size_t shorter_length,              //
-    sz_size_t bound, sz_bool_t can_be_unicode, sz_memory_allocator_t *alloc) {
-
-    // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
-    sz_memory_allocator_t global_alloc;
-    if (!alloc) {
-        sz_memory_allocator_init_default(&global_alloc);
-        alloc = &global_alloc;
-    }
-
-    // A good idea may be to dispatch different kernels for different string lengths.
-    // Like using `uint8_t` counters for strings under 255 characters long.
-    // Good in theory, this results in frequent upcasts and downcasts in serial code.
-    // On strings over 20 bytes, using `uint8` over `uint64` on 64-bit x86 CPU doubles the execution time.
-    // So one must be very cautious with such optimizations.
-    typedef sz_size_t _distance_t;
-
-    // Compute the number of columns in our Levenshtein matrix.
-    sz_size_t const n = shorter_length + 1;
-
-    // If a buffering memory-allocator is provided, this operation is practically free,
-    // and cheaper than allocating even 512 bytes (for small distance matrices) on stack.
-    sz_size_t buffer_length = sizeof(_distance_t) * (n * 2);
-
-    // If the strings contain Unicode characters, let's estimate the max character width,
-    // and use it to allocate a larger buffer to decode UTF8.
-    if ((can_be_unicode == sz_true_k) &&
-        (sz_isascii(longer, longer_length) == sz_false_k || sz_isascii(shorter, shorter_length) == sz_false_k)) {
-        buffer_length += (shorter_length + longer_length) * sizeof(sz_rune_t);
-    }
-    else { can_be_unicode = sz_false_k; }
-
-    // If the allocation fails, return the maximum distance.
-    sz_ptr_t const buffer = (sz_ptr_t)alloc->allocate(buffer_length, alloc->handle);
-    if (!buffer) return SZ_SIZE_MAX;
-
-    // Let's export the UTF8 sequence into the newly allocated buffer at the end.
-    if (can_be_unicode == sz_true_k) {
-        sz_rune_t *const longer_utf32 = (sz_rune_t *)(buffer + sizeof(_distance_t) * (n * 2));
-        sz_rune_t *const shorter_utf32 = longer_utf32 + longer_length;
-        // Export the UTF8 sequences into the newly allocated buffer.
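/* As a concrete example of the decoding above: the two-byte sequence 0xC3 0xA9 ("é") yields
 * (0xC3 & 0x1F) << 6 | (0xA9 & 0x3F) = 0x03 << 6 | 0x29 = 0xE9, i.e. U+00E9, and advances the
 * cursor by `sz_utf8_rune_2bytes_k`. After the export below every codepoint occupies a fixed
 * 4-byte `sz_rune_t` slot, so the DP loop can address the i-th character in constant time. */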
- longer_length = _sz_export_utf8_to_utf32(longer, longer_length, longer_utf32); - shorter_length = _sz_export_utf8_to_utf32(shorter, shorter_length, shorter_utf32); - longer = (sz_cptr_t)longer_utf32; - shorter = (sz_cptr_t)shorter_utf32; - } - - // Let's parameterize the core logic for different character types and distance types. -#define _wagner_fisher_unbounded(_distance_t, _char_t) \ - /* Now let's cast our pointer to avoid it in subsequent sections. */ \ - _char_t const *const longer_chars = (_char_t const *)longer; \ - _char_t const *const shorter_chars = (_char_t const *)shorter; \ - _distance_t *previous_distances = (_distance_t *)buffer; \ - _distance_t *current_distances = previous_distances + n; \ - /* Initialize the first row of the Levenshtein matrix with `iota`-style arithmetic progression. */ \ - for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ - /* The main loop of the algorithm with quadratic complexity. */ \ - for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ - _char_t const longer_char = longer_chars[idx_longer]; \ - /* Using pure pointer arithmetic is faster than iterating with an index. */ \ - _char_t const *shorter_ptr = shorter_chars; \ - _distance_t const *previous_ptr = previous_distances; \ - _distance_t *current_ptr = current_distances; \ - _distance_t *const current_end = current_ptr + shorter_length; \ - current_ptr[0] = idx_longer + 1; \ - for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ - _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ - /* We can avoid `+1` for costs here, shifting it to post-minimum computation, */ \ - /* saving one increment operation. */ \ - _distance_t cost_deletion = previous_ptr[1]; \ - _distance_t cost_insertion = current_ptr[0]; \ - /* ? It might be a good idea to enforce branchless execution here. */ \ - /* ? The caveat being that the benchmarks on longer sequences backfire and more research is needed. */ \ - current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ - } \ - /* Swap `previous_distances` and `current_distances` pointers. */ \ - _distance_t *temporary = previous_distances; \ - previous_distances = current_distances; \ - current_distances = temporary; \ - } \ - /* Cache scalar before `free` call. */ \ - sz_size_t result = previous_distances[shorter_length]; \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return result; - - // Let's define a separate variant for bounded distance computation. - // Practically the same as unbounded, but also collecting the running minimum within each row for early exit. 
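/* The early exit is sound because every cell of the Levenshtein matrix is derived as a minimum over
 * its top, top-left, and left neighbors plus a non-negative cost, so the per-row minimum never
 * decreases from one row to the next. Once an entire row is at or above `bound`, the bottom-right
 * cell (the final distance) must be as well, and the remaining rows can be skipped. */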
-#define _wagner_fisher_bounded(_distance_t, _char_t) \ - _char_t const *const longer_chars = (_char_t const *)longer; \ - _char_t const *const shorter_chars = (_char_t const *)shorter; \ - _distance_t *previous_distances = (_distance_t *)buffer; \ - _distance_t *current_distances = previous_distances + n; \ - for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ - for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ - _char_t const longer_char = longer_chars[idx_longer]; \ - _char_t const *shorter_ptr = shorter_chars; \ - _distance_t const *previous_ptr = previous_distances; \ - _distance_t *current_ptr = current_distances; \ - _distance_t *const current_end = current_ptr + shorter_length; \ - current_ptr[0] = idx_longer + 1; \ - /* Initialize min_distance with a value greater than bound */ \ - _distance_t min_distance = bound - 1; \ - for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ - _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ - _distance_t cost_deletion = previous_ptr[1]; \ - _distance_t cost_insertion = current_ptr[0]; \ - current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ - /* Keep track of the minimum distance seen so far in this row */ \ - min_distance = sz_min_of_two(current_ptr[1], min_distance); \ - } \ - /* If the minimum distance in this row exceeded the bound, return early */ \ - if (min_distance >= bound) { \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return bound; \ - } \ - _distance_t *temporary = previous_distances; \ - previous_distances = current_distances; \ - current_distances = temporary; \ - } \ - sz_size_t result = previous_distances[shorter_length]; \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return sz_min_of_two(result, bound); - - // Dispatch the actual computation. - if (!bound) { - if (can_be_unicode == sz_true_k) { _wagner_fisher_unbounded(sz_size_t, sz_rune_t); } - else { _wagner_fisher_unbounded(sz_size_t, sz_u8_t); } - } - else { - if (can_be_unicode == sz_true_k) { _wagner_fisher_bounded(sz_size_t, sz_rune_t); } - else { _wagner_fisher_bounded(sz_size_t, sz_u8_t); } - } -} - -SZ_PUBLIC sz_size_t sz_edit_distance_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Skip the matching prefixes and suffixes, they won't affect the distance. - for (sz_cptr_t a_end = longer + longer_length, b_end = shorter + shorter_length; - longer != a_end && shorter != b_end && *longer == *shorter; - ++longer, ++shorter, --longer_length, --shorter_length); - for (; longer_length && shorter_length && longer[longer_length - 1] == shorter[shorter_length - 1]; - --longer_length, --shorter_length); - - // Bounded computations may exit early. - int const is_bounded = bound < longer_length; - if (is_bounded) { - // If one of the strings is empty - the edit distance is equal to the length of the other one. 
- if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); - // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; - } - - if (shorter_length == 0) return longer_length; // If no mismatches were found - the distance is zero. - if (shorter_length == longer_length && !is_bounded) - return _sz_edit_distance_skewed_diagonals_serial(longer, longer_length, shorter, shorter_length, bound, alloc); - return _sz_edit_distance_wagner_fisher_serial(longer, longer_length, shorter, shorter_length, bound, sz_false_k, - alloc); -} - -SZ_PUBLIC sz_ssize_t sz_alignment_score_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc) { - - // If one of the strings is empty - the edit distance is equal to the length of the other one - if (longer_length == 0) return (sz_ssize_t)shorter_length * gap; - if (shorter_length == 0) return (sz_ssize_t)longer_length * gap; - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - sz_size_t n = shorter_length + 1; - sz_size_t buffer_length = sizeof(sz_ssize_t) * n * 2; - sz_ssize_t *distances = (sz_ssize_t *)alloc->allocate(buffer_length, alloc->handle); - sz_ssize_t *previous_distances = distances; - sz_ssize_t *current_distances = previous_distances + n; - - for (sz_size_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) - previous_distances[idx_shorter] = (sz_ssize_t)idx_shorter * gap; - - sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter; - sz_u8_t const *longer_unsigned = (sz_u8_t const *)longer; - for (sz_size_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { - current_distances[0] = ((sz_ssize_t)idx_longer + 1) * gap; - - // Initialize min_distance with a value greater than bound - sz_error_cost_t const *a_subs = subs + longer_unsigned[idx_longer] * 256ul; - for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) { - sz_ssize_t cost_deletion = previous_distances[idx_shorter + 1] + gap; - sz_ssize_t cost_insertion = current_distances[idx_shorter] + gap; - sz_ssize_t cost_substitution = previous_distances[idx_shorter] + a_subs[shorter_unsigned[idx_shorter]]; - current_distances[idx_shorter + 1] = sz_max_of_three(cost_deletion, cost_insertion, cost_substitution); - } - - // Swap previous_distances and current_distances pointers - sz_pointer_swap((void **)&previous_distances, (void **)¤t_distances); - } - - // Cache scalar before `free` call. 
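/* A hypothetical usage sketch for the scoring kernel above (the wrapper name and cost choices are
 * illustrative, not a library API): a substitution matrix with 0 on the diagonal and -1 elsewhere,
 * combined with a gap cost of -1, makes the maximal global alignment score equal to the negated
 * Levenshtein distance. */
static sz_ssize_t _sz_demo_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) {
    static sz_error_cost_t substitutions[256][256];
    // Re-filled on every call for brevity; a real caller would build the matrix once.
    for (sz_size_t i = 0; i != 256; ++i)
        for (sz_size_t j = 0; j != 256; ++j) substitutions[i][j] = i == j ? 0 : -1;
    // Passing a NULL allocator falls back to the default malloc-backed one, as in the code above.
    return sz_alignment_score_serial(a, a_length, b, b_length, &substitutions[0][0], -1, (sz_memory_allocator_t *)0);
}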
- sz_ssize_t result = previous_distances[shorter_length]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - - sz_size_t const min_length = sz_min_of_two(a_length, b_length); - sz_size_t const max_length = sz_max_of_two(a_length, b_length); - sz_cptr_t const a_end = a + min_length; - bound = bound == 0 ? max_length : bound; - - // Walk through both strings using SWAR and counting the number of differing characters. - sz_size_t distance = max_length - min_length; -#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN - if (min_length >= SZ_SWAR_THRESHOLD) { - sz_u64_vec_t a_vec, b_vec, match_vec; - for (; a + 8 <= a_end && distance < bound; a += 8, b += 8) { - a_vec.u64 = sz_u64_load(a).u64; - b_vec.u64 = sz_u64_load(b).u64; - match_vec = _sz_u64_each_byte_equal(a_vec, b_vec); - distance += sz_u64_popcount((~match_vec.u64) & 0x8080808080808080ull); - } - } -#endif - - for (; a != a_end && distance < bound; ++a, ++b) { distance += (*a != *b); } - return sz_min_of_two(distance, bound); -} - -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - - sz_cptr_t const a_end = a + a_length; - sz_cptr_t const b_end = b + b_length; - sz_size_t distance = 0; - - sz_rune_t a_rune, b_rune; - sz_rune_length_t a_rune_length, b_rune_length; - - if (bound) { - for (; a < a_end && b < b_end && distance < bound; a += a_rune_length, b += b_rune_length) { - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - distance += (a_rune != b_rune); - } - // If one string has more runes, we need to go through the tail. - if (distance < bound) { - for (; a < a_end && distance < bound; a += a_rune_length, ++distance) - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - - for (; b < b_end && distance < bound; b += b_rune_length, ++distance) - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - } - } - else { - for (; a < a_end && b < b_end; a += a_rune_length, b += b_rune_length) { - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - distance += (a_rune != b_rune); - } - // If one string has more runes, we need to go through the tail. - for (; a < a_end; a += a_rune_length, ++distance) _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - for (; b < b_end; b += b_rune_length, ++distance) _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - } - return distance; -} - -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length) { - sz_u64_t checksum = 0; - sz_u8_t const *text_u8 = (sz_u8_t const *)text; - sz_u8_t const *text_end = text_u8 + length; - for (; text_u8 != text_end; ++text_u8) checksum += *text_u8; - return checksum; -} - -/** - * @brief Largest prime number that fits into 31 bits. - * @see https://mersenneforum.org/showthread.php?t=3471 - */ -#define SZ_U32_MAX_PRIME (2147483647u) - -/** - * @brief Largest prime number that fits into 64 bits. - * @see https://mersenneforum.org/showthread.php?t=3471 - * - * 2^64 = 18,446,744,073,709,551,616 - * this = 18,446,744,073,709,551,557 - * diff = 59 - */ -#define SZ_U64_MAX_PRIME (18446744073709551557ull) - -/* - * One hardware-accelerated way of mixing hashes can be CRC, but it's only implemented for 32-bit values. 
- * Using a Boost-like mixer works very poorly in such case: - * - * hash_first ^ (hash_second + 0x517cc1b727220a95 + (hash_first << 6) + (hash_first >> 2)); - * - * Let's stick to the Fibonacci hash trick using the golden ratio. - * https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ - */ -#define _sz_hash_mix(first, second) ((first * 11400714819323198485ull) ^ (second * 11400714819323198485ull)) -#define _sz_shift_low(x) (x) -#define _sz_shift_high(x) ((x + 77ull) & 0xFFull) -#define _sz_prime_mod(x) (x % SZ_U64_MAX_PRIME) - -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length) { - - sz_u64_t hash_low = 0; - sz_u64_t hash_high = 0; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - switch (length) { - case 0: return 0; - - // Texts under 7 bytes long are definitely below the largest prime. - case 1: - hash_low = _sz_shift_low(text[0]); - hash_high = _sz_shift_high(text[0]); - break; - case 2: - hash_low = _sz_shift_low(text[0]) * 31ull + _sz_shift_low(text[1]); - hash_high = _sz_shift_high(text[0]) * 257ull + _sz_shift_high(text[1]); - break; - case 3: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull + // - _sz_shift_low(text[2]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull + // - _sz_shift_high(text[2]); - break; - case 4: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull + // - _sz_shift_low(text[3]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull + // - _sz_shift_high(text[3]); - break; - case 5: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull + // - _sz_shift_low(text[4]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull + // - _sz_shift_high(text[4]); - break; - case 6: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull + // - _sz_shift_low(text[5]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull + // - _sz_shift_high(text[5]); - break; - case 7: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull * 31ull + // - _sz_shift_low(text[5]) * 31ull + // - _sz_shift_low(text[6]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull 
* 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull * 257ull + // - _sz_shift_high(text[5]) * 257ull + // - _sz_shift_high(text[6]); - break; - default: - // Unroll the first seven cycles: - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - hash_low = hash_low * 31ull + _sz_shift_low(text[1]); - hash_high = hash_high * 257ull + _sz_shift_high(text[1]); - hash_low = hash_low * 31ull + _sz_shift_low(text[2]); - hash_high = hash_high * 257ull + _sz_shift_high(text[2]); - hash_low = hash_low * 31ull + _sz_shift_low(text[3]); - hash_high = hash_high * 257ull + _sz_shift_high(text[3]); - hash_low = hash_low * 31ull + _sz_shift_low(text[4]); - hash_high = hash_high * 257ull + _sz_shift_high(text[4]); - hash_low = hash_low * 31ull + _sz_shift_low(text[5]); - hash_high = hash_high * 257ull + _sz_shift_high(text[5]); - hash_low = hash_low * 31ull + _sz_shift_low(text[6]); - hash_high = hash_high * 257ull + _sz_shift_high(text[6]); - text += 7; - - // Iterate throw the rest with the modulus: - for (; text != text_end; ++text) { - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - } - break; - } + sz_u64_vec_t h_even_vec, h_odd_vec, n_vec, matches_even_vec, matches_odd_vec; + n_vec.u64 = 0; + n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1]; + n_vec.u64 *= 0x0001000100010001ull; // broadcast - return _sz_hash_mix(hash_low, hash_high); -} + // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. + for (; h + 9 <= h_end; h += 8) { + h_even_vec.u64 = *(sz_u64_t *)h; + h_odd_vec.u64 = (h_even_vec.u64 >> 8) | ((sz_u64_t)h[8] << 56); + matches_even_vec = _sz_u64_each_2byte_equal(h_even_vec, n_vec); + matches_odd_vec = _sz_u64_each_2byte_equal(h_odd_vec, n_vec); -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Compute the initial hash value for the first window. - sz_u64_t hash_low = 0, hash_high = 0, hash_mix; - for (sz_u8_t const *first_end = text + window_length; text < first_end; ++text) - hash_low = (hash_low * 31ull + _sz_shift_low(*text)) % SZ_U64_MAX_PRIME, - hash_high = (hash_high * 257ull + _sz_shift_high(*text)) % SZ_U64_MAX_PRIME; - - // In most cases the fingerprint length will be a power of two. - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - - // Compute the hash value for every window, exporting into the fingerprint, - // using the expensive modulo operation. 
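/* Each step of the loop below implements the usual polynomial rolling-hash update: the outgoing
 * character's contribution, c_out * 31^(window_length - 1), is subtracted (that power is kept in
 * `prime_power_low`), the remainder is scaled by the base, the incoming character is added, and the
 * result is reduced modulo SZ_U64_MAX_PRIME; the 257-based high hash is updated the same way. */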
- sz_size_t cycles = 1; - sz_size_t const step_mask = step - 1; - for (; text < text_end; ++text, ++cycles) { - // Discard one character: - hash_low -= _sz_shift_low(*(text - window_length)) * prime_power_low; - hash_high -= _sz_shift_high(*(text - window_length)) * prime_power_high; - // And add a new one: - hash_low = 31ull * hash_low + _sz_shift_low(*text); - hash_high = 257ull * hash_high + _sz_shift_high(*text); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - // Mix only if we've skipped enough hashes. - if ((cycles & step_mask) == 0) { - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); + matches_even_vec.u64 >>= 8; + if (matches_even_vec.u64 + matches_odd_vec.u64) { + sz_u64_t match_indicators = matches_even_vec.u64 | matches_odd_vec.u64; + return h + sz_u64_ctz(match_indicators) / 8; } } -} - -#undef _sz_shift_low -#undef _sz_shift_high -#undef _sz_hash_mix -#undef _sz_prime_mod - -/** - * @brief Uses a small lookup-table to convert a lowercase character to uppercase. - */ -SZ_INTERNAL sz_u8_t sz_u8_tolower(sz_u8_t c) { - static sz_u8_t const lowered[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return lowered[c]; -} -/** - * @brief Uses a small lookup-table to convert an uppercase character to lowercase. 
- */ -SZ_INTERNAL sz_u8_t sz_u8_toupper(sz_u8_t c) { - static sz_u8_t const upped[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return upped[c]; + for (; h + 2 <= h_end; ++h) + if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; + return SZ_NULL_CHAR; } /** - * @brief Uses two small lookup tables (768 bytes total) to accelerate division by a small - * unsigned integer. Performs two lookups, one multiplication, two shifts, and two accumulations. - * - * @param divisor Integral value @b larger than one. - * @param number Integral value to divide. + * @brief 4Byte-level equality comparison between two 64-bit integers. + * @return 64-bit integer, where every top bit in each 4byte signifies a match. 
*/ -SZ_INTERNAL sz_u8_t sz_u8_divide(sz_u8_t number, sz_u8_t divisor) { - sz_assert(divisor > 1); - static sz_u16_t const multipliers[256] = { - 0, 0, 0, 21846, 0, 39322, 21846, 9363, 0, 50973, 39322, 29790, 21846, 15124, 9363, 4370, - 0, 57826, 50973, 44841, 39322, 34329, 29790, 25645, 21846, 18351, 15124, 12137, 9363, 6780, 4370, 2115, - 0, 61565, 57826, 54302, 50973, 47824, 44841, 42011, 39322, 36765, 34329, 32006, 29790, 27671, 25645, 23705, - 21846, 20063, 18351, 16706, 15124, 13602, 12137, 10725, 9363, 8049, 6780, 5554, 4370, 3224, 2115, 1041, - 0, 63520, 61565, 59668, 57826, 56039, 54302, 52614, 50973, 49377, 47824, 46313, 44841, 43407, 42011, 40649, - 39322, 38028, 36765, 35532, 34329, 33154, 32006, 30885, 29790, 28719, 27671, 26647, 25645, 24665, 23705, 22766, - 21846, 20945, 20063, 19198, 18351, 17520, 16706, 15907, 15124, 14356, 13602, 12863, 12137, 11424, 10725, 10038, - 9363, 8700, 8049, 7409, 6780, 6162, 5554, 4957, 4370, 3792, 3224, 2665, 2115, 1573, 1041, 517, - 0, 64520, 63520, 62535, 61565, 60609, 59668, 58740, 57826, 56926, 56039, 55164, 54302, 53452, 52614, 51788, - 50973, 50169, 49377, 48595, 47824, 47063, 46313, 45572, 44841, 44120, 43407, 42705, 42011, 41326, 40649, 39982, - 39322, 38671, 38028, 37392, 36765, 36145, 35532, 34927, 34329, 33738, 33154, 32577, 32006, 31443, 30885, 30334, - 29790, 29251, 28719, 28192, 27671, 27156, 26647, 26143, 25645, 25152, 24665, 24182, 23705, 23233, 22766, 22303, - 21846, 21393, 20945, 20502, 20063, 19628, 19198, 18772, 18351, 17933, 17520, 17111, 16706, 16305, 15907, 15514, - 15124, 14738, 14356, 13977, 13602, 13231, 12863, 12498, 12137, 11779, 11424, 11073, 10725, 10380, 10038, 9699, - 9363, 9030, 8700, 8373, 8049, 7727, 7409, 7093, 6780, 6470, 6162, 5857, 5554, 5254, 4957, 4662, - 4370, 4080, 3792, 3507, 3224, 2943, 2665, 2388, 2115, 1843, 1573, 1306, 1041, 778, 517, 258, - }; - // This table can be avoided using a single addition and counting trailing zeros. 
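/* As a concrete check of this division scheme: dividing 200 by 3 uses multipliers[3] = 21846 (above)
 * and shifts[3] = 1 (below), so q = (21846 * 200) >> 16 = 66, t = ((200 - 66) >> 1) + 66 = 133, and
 * the result is t >> 1 = 66, which matches 200 / 3 exactly. */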
- static sz_u8_t const shifts[256] = { - 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // - 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // - 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // - 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - }; - sz_u32_t multiplier = multipliers[divisor]; - sz_u8_t shift = shifts[divisor]; - - sz_u16_t q = (sz_u16_t)((multiplier * number) >> 16); - sz_u16_t t = ((number - q) >> 1) + q; - return (sz_u8_t)(t >> shift); -} - -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result) { - sz_u8_t const *unsigned_lut = (sz_u8_t const *)lut; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = unsigned_lut[*unsigned_text]; -} - -SZ_PUBLIC void sz_tolower_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_tolower(*unsigned_text); -} - -SZ_PUBLIC void sz_toupper_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_toupper(*unsigned_text); -} - -SZ_PUBLIC void sz_toascii_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = *unsigned_text & 0x7F; +SZ_INTERNAL sz_u64_vec_t _sz_u64_each_4byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { + sz_u64_vec_t vec; + vec.u64 = ~(a.u64 ^ b.u64); + // The match is valid, if every bit within each 4byte is set. + // For that take the bottom 31 bits of each 4byte, add one to them, + // and if this sets the top bit to one, then all the 31 bits are ones as well. + vec.u64 = ((vec.u64 & 0x7FFFFFFF7FFFFFFFull) + 0x0000000100000001ull) & ((vec.u64 & 0x8000000080000000ull)); + return vec; } /** - * @brief Check if there is a byte in this buffer, that exceeds 127 and can't be an ASCII character. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. + * @brief Find the first occurrence of a @b four-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. 
*/ -SZ_PUBLIC sz_bool_t sz_isascii_serial(sz_cptr_t text, sz_size_t length) { +SZ_INTERNAL sz_cptr_t _sz_find_4byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - if (!length) return sz_true_k; - sz_u8_t const *h = (sz_u8_t const *)text; - sz_u8_t const *const h_end = h + length; + // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. + sz_assert(h_length >= 4 && "The haystack is too short."); + sz_cptr_t const h_end = h + h_length; #if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h & 0x80ull) return sz_false_k; + for (; ((sz_size_t)h & 7ull) && h + 4 <= h_end; ++h) + if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; #endif - // Validate eight bytes at once using SWAR. - sz_u64_vec_t text_vec; - for (; h + 8 <= h_end; h += 8) { - text_vec.u64 = *(sz_u64_t const *)h; - if (text_vec.u64 & 0x8080808080808080ull) return sz_false_k; - } - - // Handle the misaligned tail. - for (; h < h_end; ++h) - if (*h & 0x80ull) return sz_false_k; - return sz_true_k; -} - -SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { - - sz_assert(alphabet_size > 0 && alphabet_size <= 256 && "Inadequate alphabet size"); + sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, n_vec, matches0_vec, matches1_vec, matches2_vec, matches3_vec; + n_vec.u64 = 0; + n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2], n_vec.u8s[3] = n[3]; + n_vec.u64 *= 0x0000000100000001ull; // broadcast - if (alphabet_size == 1) sz_fill(result, result_length, *alphabet); + // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using four 64-bit words. + // We load the subsequent four-byte word as well, taking its first bytes. Think of it as a glorified prefetch :) + sz_u64_t h_page_current, h_page_next; + for (; h + sizeof(sz_u64_t) + sizeof(sz_u32_t) <= h_end; h += sizeof(sz_u64_t)) { + h_page_current = *(sz_u64_t *)h; + h_page_next = *(sz_u32_t *)(h + 8); + h0_vec.u64 = (h_page_current); + h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); + h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); + h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); + matches0_vec = _sz_u64_each_4byte_equal(h0_vec, n_vec); + matches1_vec = _sz_u64_each_4byte_equal(h1_vec, n_vec); + matches2_vec = _sz_u64_each_4byte_equal(h2_vec, n_vec); + matches3_vec = _sz_u64_each_4byte_equal(h3_vec, n_vec); - else { - sz_assert(generator && "Expects a valid random generator"); - sz_u8_t divisor = (sz_u8_t)alphabet_size; - for (sz_cptr_t end = result + result_length; result != end; ++result) { - sz_u8_t random = generator(generator_user_data) & 0xFF; - sz_u8_t quotient = sz_u8_divide(random, divisor); - *result = alphabet[random - quotient * divisor]; + if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64) { + matches0_vec.u64 >>= 24; + matches1_vec.u64 >>= 16; + matches2_vec.u64 >>= 8; + sz_u64_t match_indicators = matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64; + return h + sz_u64_ctz(match_indicators) / 8; } } -} - -#pragma endregion - -/* - * Serial implementation of string class operations. 
- */ -#pragma region Serial Implementation for the String Class - -SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string) { - // It doesn't matter if it's on stack or heap, the pointer location is the same. - return (sz_bool_t)((sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]); -} -SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length) { - sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]; - sz_size_t is_big_mask = is_small - 1ull; - *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same. - // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. - *length = string->external.length & (0x00000000000000FFull | is_big_mask); + for (; h + 4 <= h_end; ++h) + if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; + return SZ_NULL_CHAR; } -SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, - sz_bool_t *is_external) { - sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]; - sz_size_t is_big_mask = is_small - 1ull; - *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same. - // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. - *length = string->external.length & (0x00000000000000FFull | is_big_mask); - // In case the string is small, the `is_small - 1ull` will become 0xFFFFFFFFFFFFFFFFull. - *space = sz_u64_blend(SZ_STRING_INTERNAL_SPACE, string->external.space, is_big_mask); - *is_external = (sz_bool_t)!is_small; +/** + * @brief 3Byte-level equality comparison between two 64-bit integers. + * @return 64-bit integer, where every top bit in each 3byte signifies a match. + */ +SZ_INTERNAL sz_u64_vec_t _sz_u64_each_3byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { + sz_u64_vec_t vec; + vec.u64 = ~(a.u64 ^ b.u64); + // The match is valid, if every bit within each 4byte is set. + // For that take the bottom 31 bits of each 4byte, add one to them, + // and if this sets the top bit to one, then all the 31 bits are ones as well. + vec.u64 = ((vec.u64 & 0xFFFF7FFFFF7FFFFFull) + 0x0000000001000001ull) & ((vec.u64 & 0x0000800000800000ull)); + return vec; } -SZ_PUBLIC sz_bool_t sz_string_equal(sz_string_t const *a, sz_string_t const *b) { - // Tempting to say that the external.length is bitwise the same even if it includes - // some bytes of the on-stack payload, but we don't at this writing maintain that invariant. - // (An on-stack string includes noise bytes in the high-order bits of external.length. So do this - // the hard/correct way. - -#if SZ_USE_MISALIGNED_LOADS - // Dealing with StringZilla strings, we know that the `start` pointer always points - // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. - -#endif - // Alternatively, fall back to byte-by-byte comparison. - sz_ptr_t a_start, b_start; - sz_size_t a_length, b_length; - sz_string_range(a, &a_start, &a_length); - sz_string_range(b, &b_start, &b_length); - return (sz_bool_t)(a_length == b_length && sz_equal(a_start, b_start, b_length)); -} +/** + * @brief Find the first occurrence of a @b three-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. 
+ */ +SZ_INTERNAL sz_cptr_t _sz_find_3byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { -SZ_PUBLIC sz_ordering_t sz_string_order(sz_string_t const *a, sz_string_t const *b) { -#if SZ_USE_MISALIGNED_LOADS - // Dealing with StringZilla strings, we know that the `start` pointer always points - // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. + // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. + sz_assert(h_length >= 3 && "The haystack is too short."); + sz_cptr_t const h_end = h + h_length; +#if !SZ_USE_MISALIGNED_LOADS + // Process the misaligned head, to void UB on unaligned 64-bit loads. + for (; ((sz_size_t)h & 7ull) && h + 3 <= h_end; ++h) + if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h; #endif - // Alternatively, fall back to byte-by-byte comparison. - sz_ptr_t a_start, b_start; - sz_size_t a_length, b_length; - sz_string_range(a, &a_start, &a_length); - sz_string_range(b, &b_start, &b_length); - return sz_order(a_start, a_length, b_start, b_length); -} - -SZ_PUBLIC void sz_string_init(sz_string_t *string) { - sz_assert(string && "String can't be SZ_NULL."); - - // Only 8 + 1 + 1 need to be initialized. - string->internal.start = &string->internal.chars[0]; - // But for safety let's initialize the entire structure to zeros. - // string->internal.chars[0] = 0; - // string->internal.length = 0; - string->words[1] = 0; - string->words[2] = 0; - string->words[3] = 0; -} - -SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator) { - sz_size_t space_needed = length + 1; // space for trailing \0 - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); - // Initialize the string to zeros for safety. - string->words[1] = 0; - string->words[2] = 0; - string->words[3] = 0; - // If we are lucky, no memory allocations will be needed. - if (space_needed <= SZ_STRING_INTERNAL_SPACE) { - string->internal.start = &string->internal.chars[0]; - string->internal.length = (sz_u8_t)length; - } - else { - // If we are not lucky, we need to allocate memory. - string->external.start = (sz_ptr_t)allocator->allocate(space_needed, allocator->handle); - if (!string->external.start) return SZ_NULL_CHAR; - string->external.length = length; - string->external.space = space_needed; - } - sz_assert(&string->internal.start == &string->external.start && "Alignment confusion"); - string->external.start[length] = 0; - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - sz_size_t new_space = new_capacity + 1; - if (new_space <= SZ_STRING_INTERNAL_SPACE) return string->external.start; - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - sz_assert(new_space > string_space && "New space must be larger than current."); - - sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); - if (!new_start) return SZ_NULL_CHAR; - - sz_copy(new_start, string_start, string_length); - string->external.start = new_start; - string->external.space = new_space; - string->external.padding = 0; - string->external.length = string_length; - - // Deallocate the old string. 
- if (string_is_external) allocator->free(string_start, string_space, allocator->handle); - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // We may already be space-optimal, and in that case we don't need to do anything. - sz_size_t new_space = string_length + 1; - if (string_space == new_space || !string_is_external) return string->external.start; - - sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); - if (!new_start) return SZ_NULL_CHAR; - - sz_copy(new_start, string_start, string_length); - string->external.start = new_start; - string->external.space = new_space; - string->external.padding = 0; - string->external.length = string_length; - - // Deallocate the old string. - if (string_is_external) allocator->free(string_start, string_space, allocator->handle); - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length, - sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // The user intended to extend the string. - offset = sz_min_of_two(offset, string_length); - - // If we are lucky, no memory allocations will be needed. - if (string_length + added_length < string_space) { - sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); - string_start[string_length + added_length] = 0; - // Even if the string is on the stack, the `+=` won't affect the tail of the string. - string->external.length += added_length; - } - // If we are not lucky, we need to allocate more memory. - else { - sz_size_t next_planned_size = sz_max_of_two(SZ_CACHE_LINE_WIDTH, string_space * 2ull); - sz_size_t min_needed_space = sz_size_bit_ceil(offset + string_length + added_length + 1); - sz_size_t new_space = sz_max_of_two(min_needed_space, next_planned_size); - string_start = sz_string_reserve(string, new_space - 1, allocator); - if (!string_start) return SZ_NULL_CHAR; - - // Copy into the new buffer. - sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); - string_start[string_length + added_length] = 0; - string->external.length = string_length + added_length; - } - - return string_start; -} - -SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length) { - - sz_assert(string && "String can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // Normalize the offset, it can't be larger than the length. - offset = sz_min_of_two(offset, string_length); - - // We shouldn't normalize the length, to avoid overflowing on `offset + length >= string_length`, - // if receiving `length == SZ_SIZE_MAX`. 
After following expression the `length` will contain - // exactly the delta between original and final length of this `string`. - length = sz_min_of_two(length, string_length - offset); - - // There are 2 common cases, that wouldn't even require a `memmove`: - // 1. Erasing the entire contents of the string. - // In that case `length` argument will be equal or greater than `length` member. - // 2. Removing the tail of the string with something like `string.pop_back()` in C++. - // - // In both of those, regardless of the location of the string - stack or heap, - // the erasing is as easy as setting the length to the offset. - // In every other case, we must `memmove` the tail of the string to the left. - if (offset + length < string_length) - sz_move(string_start + offset, string_start + offset + length, string_length - offset - length); - - // The `string->external.length = offset` assignment would discard last characters - // of the on-the-stack string, but inplace subtraction would work. - string->external.length -= length; - string_start[string_length - length] = 0; - return length; -} - -SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator) { - if (!sz_string_is_on_stack(string)) - allocator->free(string->external.start, string->external.space, allocator->handle); - sz_string_init(string); -} - -// When overriding libc, disable optimisations for this function beacuse MSVC will optimize the loops into a memset. -// Which then causes a stack overflow due to infinite recursion (memset -> sz_fill_serial -> memset). -#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC -#pragma optimize("", off) -#endif -SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - sz_ptr_t end = target + length; - // Dealing with short strings, a single sequential pass would be faster. - // If the size is larger than 2 words, then at least 1 of them will be aligned. - // But just one aligned word may not be worth SWAR. - if (length < SZ_SWAR_THRESHOLD) - while (target != end) *(target++) = value; - - // In case of long strings, skip unaligned bytes, and then fill the rest in 64-bit chunks. - else { - sz_u64_t value64 = (sz_u64_t)value * 0x0101010101010101ull; - while ((sz_size_t)target & 7ull) *(target++) = value; - while (target + 8 <= end) *(sz_u64_t *)target = value64, target += 8; - while (target != end) *(target++) = value; - } -} -#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC -#pragma optimize("", on) -#endif + // We fetch 12 + sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, h4_vec; + sz_u64_vec_t matches0_vec, matches1_vec, matches2_vec, matches3_vec, matches4_vec; + sz_u64_vec_t n_vec; + n_vec.u64 = 0; + n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2]; + n_vec.u64 *= 0x0000000001000001ull; // broadcast -SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_MISALIGNED_LOADS - while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)source, target += 8, source += 8, length -= 8; -#endif - while (length--) *(target++) = *(source++); -} + // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using three 64-bit words. + // We load the subsequent two-byte word as well. 
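For reviewers unfamiliar with the SWAR style used throughout this file, here is a minimal standalone sketch (not part of the patch; buffer size and value are arbitrary) of the byte-broadcast trick that both the removed `sz_fill_serial` and the new shift-table initialization rely on: multiplying a byte by 0x0101010101010101 replicates it across all eight lanes of a 64-bit word.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    uint8_t value = 0x2A;
    uint64_t value64 = (uint64_t)value * 0x0101010101010101ull; /* 0x2A2A2A2A2A2A2A2A */

    /* A miniature word-at-a-time fill; the real serial kernel also aligns the head and handles the tail. */
    unsigned char buffer[24];
    for (size_t i = 0; i + 8 <= sizeof(buffer); i += 8) memcpy(buffer + i, &value64, 8);

    printf("0x%016llx, buffer[0] = 0x%02X, buffer[23] = 0x%02X\n", //
           (unsigned long long)value64, buffer[0], buffer[23]);
    return 0;
}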
+ sz_u64_t h_page_current, h_page_next; + for (; h + sizeof(sz_u64_t) + sizeof(sz_u16_t) <= h_end; h += sizeof(sz_u64_t)) { + h_page_current = *(sz_u64_t *)h; + h_page_next = *(sz_u16_t *)(h + 8); + h0_vec.u64 = (h_page_current); + h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); + h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); + h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); + h4_vec.u64 = (h_page_current >> 32) | (h_page_next << 32); + matches0_vec = _sz_u64_each_3byte_equal(h0_vec, n_vec); + matches1_vec = _sz_u64_each_3byte_equal(h1_vec, n_vec); + matches2_vec = _sz_u64_each_3byte_equal(h2_vec, n_vec); + matches3_vec = _sz_u64_each_3byte_equal(h3_vec, n_vec); + matches4_vec = _sz_u64_each_3byte_equal(h4_vec, n_vec); -SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // Implementing `memmove` is trickier, than `memcpy`, as the ranges may overlap. - // Existing implementations often have two passes, in normal and reversed order, - // depending on the relation of `target` and `source` addresses. - // https://student.cs.uwaterloo.ca/~cs350/common/os161-src-html/doxygen/html/memmove_8c_source.html - // https://marmota.medium.com/c-language-making-memmove-def8792bb8d5 - // - // We can use the `memcpy` like left-to-right pass if we know that the `target` is before `source`. - // Or if we know that they don't intersect! In that case the traversal order is irrelevant, - // but older CPUs may predict and fetch forward-passes better. - if (target < source || target >= source + length) { -#if SZ_USE_MISALIGNED_LOADS - while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)(source), target += 8, source += 8, length -= 8; -#endif - while (length--) *(target++) = *(source++); - } - else { - // Jump to the end and walk backwards. - target += length, source += length; -#if SZ_USE_MISALIGNED_LOADS - while (length >= 8) *(sz_u64_t *)(target -= 8) = *(sz_u64_t const *)(source -= 8), length -= 8; -#endif - while (length--) *(--target) = *(--source); + if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64) { + matches0_vec.u64 >>= 16; + matches1_vec.u64 >>= 8; + matches3_vec.u64 <<= 8; + matches4_vec.u64 <<= 16; + sz_u64_t match_indicators = + matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64; + return h + sz_u64_ctz(match_indicators) / 8; + } } -} - -#pragma endregion - -/* - * @brief Serial implementation for strings sequence processing. - */ -#pragma region Serial Implementation for Sequences - -SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) { - - sz_size_t matches = 0; - while (matches != sequence->count && predicate(sequence, sequence->order[matches])) ++matches; - for (sz_size_t i = matches + 1; i < sequence->count; ++i) - if (predicate(sequence, sequence->order[i])) - sz_u64_swap(sequence->order + i, sequence->order + matches), ++matches; - - return matches; + for (; h + 3 <= h_end; ++h) + if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h; + return SZ_NULL_CHAR; } -SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) { - - sz_size_t start_b = partition + 1; - - // If the direct merge is already sorted - if (!less(sequence, sequence->order[start_b], sequence->order[partition])) return; +/** + * @brief Boyer-Moore-Horspool algorithm for exact matching of patterns up to @b 256-bytes long. 
+ * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. + */ +SZ_INTERNAL sz_cptr_t _sz_find_horspool_upto_256bytes_serial( // + sz_cptr_t h_chars, sz_size_t h_length, // + sz_cptr_t n_chars, sz_size_t n_length) { + sz_assert(n_length <= 256 && "The pattern is too long."); + // Several popular string matching algorithms are using a bad-character shift table. + // Boyer Moore: https://www-igm.univ-mlv.fr/~lecroq/string/node14.html + // Quick Search: https://www-igm.univ-mlv.fr/~lecroq/string/node19.html + // Smith: https://www-igm.univ-mlv.fr/~lecroq/string/node21.html + union { + sz_u8_t jumps[256]; + sz_u64_vec_t vecs[64]; + } bad_shift_table; - sz_size_t start_a = 0; - while (start_a <= partition && start_b <= sequence->count) { + // Let's initialize the table using SWAR to the total length of the string. + sz_u8_t const *h = (sz_u8_t const *)h_chars; + sz_u8_t const *n = (sz_u8_t const *)n_chars; + { + sz_u64_vec_t n_length_vec; + n_length_vec.u64 = n_length; + n_length_vec.u64 *= 0x0101010101010101ull; // broadcast + for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; + for (sz_size_t i = 0; i + 1 < n_length; ++i) bad_shift_table.jumps[n[i]] = (sz_u8_t)(n_length - i - 1); + } - // If element 1 is in right place - if (!less(sequence, sequence->order[start_b], sequence->order[start_a])) { start_a++; } - else { - sz_size_t value = sequence->order[start_b]; - sz_size_t index = start_b; + // Another common heuristic is to match a few characters from different parts of a string. + // Raita suggests to use the first two, the last, and the middle character of the pattern. + sz_u32_vec_t h_vec, n_vec; - // Shift all the elements between element 1 - // element 2, right by 1. - while (index != start_a) { sequence->order[index] = sequence->order[index - 1], index--; } - sequence->order[start_a] = value; + // Pick the parts of the needle that are worth comparing. + sz_size_t offset_first, offset_mid, offset_last; + _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - // Update all the pointers - start_a++; - partition++; - start_b++; - } - } -} + // Broadcast those characters into an unsigned integer. + n_vec.u8s[0] = n[offset_first]; + n_vec.u8s[1] = n[offset_first + 1]; + n_vec.u8s[2] = n[offset_mid]; + n_vec.u8s[3] = n[offset_last]; -SZ_PUBLIC void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - sz_u64_t *keys = sequence->order; - sz_size_t keys_count = sequence->count; - for (sz_size_t i = 1; i < keys_count; i++) { - sz_u64_t i_key = keys[i]; - sz_size_t j = i; - for (; j > 0 && less(sequence, i_key, keys[j - 1]); --j) keys[j] = keys[j - 1]; - keys[j] = i_key; + // Scan through the whole haystack, skipping the last `n_length - 1` bytes. 
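For comparison, the textbook Boyer-Moore-Horspool loop that `_sz_find_horspool_upto_256bytes_serial` specializes looks roughly like the standalone sketch below (not part of the patch; function and variable names are made up). The library version differs by storing 8-bit jumps, initializing the table with SWAR, and filtering candidates with the Raita anchors before the full comparison:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Classic Horspool: every byte gets a default shift of `n_len`; bytes that occur in the
   needle (except its last position) get the distance from their last occurrence to the end. */
static char const *horspool(char const *h, size_t h_len, char const *n, size_t n_len) {
    if (!n_len || h_len < n_len) return NULL;
    size_t jumps[256];
    for (size_t c = 0; c != 256; ++c) jumps[c] = n_len;
    for (size_t i = 0; i + 1 < n_len; ++i) jumps[(unsigned char)n[i]] = n_len - i - 1;
    for (size_t i = 0; i + n_len <= h_len;) {
        if (memcmp(h + i, n, n_len) == 0) return h + i;
        i += jumps[(unsigned char)h[i + n_len - 1]]; /* skip by the window's last character */
    }
    return NULL;
}

int main(void) {
    char const *match = horspool("abracadabra", 11, "cad", 3);
    printf("%s\n", match ? match : "(none)"); /* prints "cadabra" */
    return 0;
}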
+ for (sz_size_t i = 0; i <= h_length - n_length;) { + h_vec.u8s[0] = h[i + offset_first]; + h_vec.u8s[1] = h[i + offset_first + 1]; + h_vec.u8s[2] = h[i + offset_mid]; + h_vec.u8s[3] = h[i + offset_last]; + if (h_vec.u32 == n_vec.u32 && sz_equal_serial((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; + i += bad_shift_table.jumps[h[i + n_length - 1]]; } + return SZ_NULL_CHAR; } -SZ_INTERNAL void _sz_sift_down(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start, - sz_size_t end) { - sz_size_t root = start; - while (2 * root + 1 <= end) { - sz_size_t child = 2 * root + 1; - if (child + 1 <= end && less(sequence, order[child], order[child + 1])) { child++; } - if (!less(sequence, order[root], order[child])) { return; } - sz_u64_swap(order + root, order + child); - root = child; - } -} +/** + * @brief Boyer-Moore-Horspool algorithm for @b reverse-order exact matching of patterns up to @b 256-bytes long. + * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. + */ +SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_upto_256bytes_serial( // + sz_cptr_t h_chars, sz_size_t h_length, // + sz_cptr_t n_chars, sz_size_t n_length) { + sz_assert(n_length <= 256 && "The pattern is too long."); + union { + sz_u8_t jumps[256]; + sz_u64_vec_t vecs[64]; + } bad_shift_table; -SZ_INTERNAL void _sz_heapify(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t count) { - sz_size_t start = (count - 2) / 2; - while (1) { - _sz_sift_down(sequence, less, order, start, count - 1); - if (start == 0) return; - start--; + // Let's initialize the table using SWAR to the total length of the string. + sz_u8_t const *h = (sz_u8_t const *)h_chars; + sz_u8_t const *n = (sz_u8_t const *)n_chars; + { + sz_u64_vec_t n_length_vec; + n_length_vec.u64 = n_length; + n_length_vec.u64 *= 0x0101010101010101ull; // broadcast + for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; + for (sz_size_t i = 0; i + 1 < n_length; ++i) + bad_shift_table.jumps[n[n_length - i - 1]] = (sz_u8_t)(n_length - i - 1); } -} -SZ_INTERNAL void _sz_heapsort(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last) { - sz_u64_t *order = sequence->order; - sz_size_t count = last - first; - _sz_heapify(sequence, less, order + first, count); - sz_size_t end = count - 1; - while (end > 0) { - sz_u64_swap(order + first, order + first + end); - end--; - _sz_sift_down(sequence, less, order + first, 0, end); - } -} + // Another common heuristic is to match a few characters from different parts of a string. + // Raita suggests to use the first two, the last, and the middle character of the pattern. 
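The Raita-style filter mentioned above can be shown in isolation: pack a few anchor characters of the needle into one 32-bit word and compare that word before paying for a full `memcmp`. The sketch below (not part of the patch) hard-codes the first-two/middle/last choice described in the comment; `_sz_locate_needle_anomalies`, defined elsewhere in the header, chooses the offsets dynamically:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Pack the first two, the middle, and the last characters into one comparable 32-bit word. */
static uint32_t anchors(char const *s, size_t length) {
    return (uint32_t)(unsigned char)s[0] |                //
           (uint32_t)(unsigned char)s[1] << 8 |           //
           (uint32_t)(unsigned char)s[length / 2] << 16 | //
           (uint32_t)(unsigned char)s[length - 1] << 24;
}

int main(void) {
    char const *haystack = "the quick brown fox jumps over the lazy dog";
    char const *needle = "brown"; /* at least two characters long for this sketch */
    size_t h_length = strlen(haystack), n_length = strlen(needle);
    uint32_t n_anchors = anchors(needle, n_length);
    for (size_t i = 0; i + n_length <= h_length; ++i)
        /* The cheap 32-bit filter rejects most positions, so `memcmp` runs only on near-matches. */
        if (anchors(haystack + i, n_length) == n_anchors && memcmp(haystack + i, needle, n_length) == 0) {
            printf("match at offset %zu\n", i); /* prints "match at offset 10" */
            return 0;
        }
    printf("no match\n");
    return 0;
}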
+ sz_u32_vec_t h_vec, n_vec; -SZ_PUBLIC void sz_sort_introsort_recursion(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, - sz_size_t last, sz_size_t depth) { - - sz_size_t length = last - first; - switch (length) { - case 0: - case 1: return; - case 2: - if (less(sequence, sequence->order[first + 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[first + 1]); - return; - case 3: { - sz_u64_t a = sequence->order[first]; - sz_u64_t b = sequence->order[first + 1]; - sz_u64_t c = sequence->order[first + 2]; - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - if (less(sequence, c, b)) sz_u64_swap(&c, &b); - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - sequence->order[first] = a; - sequence->order[first + 1] = b; - sequence->order[first + 2] = c; - return; - } - } - // Until a certain length, the quadratic-complexity insertion-sort is fine - if (length <= 16) { - sz_sequence_t sub_seq = *sequence; - sub_seq.order += first; - sub_seq.count = length; - sz_sort_insertion(&sub_seq, less); - return; - } + // Pick the parts of the needle that are worth comparing. + sz_size_t offset_first, offset_mid, offset_last; + _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - // Fallback to N-logN-complexity heap-sort - if (depth == 0) { - _sz_heapsort(sequence, less, first, last); - return; - } + // Broadcast those characters into an unsigned integer. + n_vec.u8s[0] = n[offset_first]; + n_vec.u8s[1] = n[offset_first + 1]; + n_vec.u8s[2] = n[offset_mid]; + n_vec.u8s[3] = n[offset_last]; - --depth; - - // Median-of-three logic to choose pivot - sz_size_t median = first + length / 2; - if (less(sequence, sequence->order[median], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[median]); - if (less(sequence, sequence->order[last - 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[last - 1]); - if (less(sequence, sequence->order[median], sequence->order[last - 1])) - sz_u64_swap(&sequence->order[median], &sequence->order[last - 1]); - - // Partition using the median-of-three as the pivot - sz_u64_t pivot = sequence->order[median]; - sz_size_t left = first; - sz_size_t right = last - 1; - while (1) { - while (less(sequence, sequence->order[left], pivot)) left++; - while (less(sequence, pivot, sequence->order[right])) right--; - if (left >= right) break; - sz_u64_swap(&sequence->order[left], &sequence->order[right]); - left++; - right--; + // Scan through the whole haystack, skipping the first `n_length - 1` bytes. 
+ for (sz_size_t j = 0; j <= h_length - n_length;) { + sz_size_t i = h_length - n_length - j; + h_vec.u8s[0] = h[i + offset_first]; + h_vec.u8s[1] = h[i + offset_first + 1]; + h_vec.u8s[2] = h[i + offset_mid]; + h_vec.u8s[3] = h[i + offset_last]; + if (h_vec.u32 == n_vec.u32 && sz_equal_serial((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; + j += bad_shift_table.jumps[h[i]]; } - - // Recursively sort the partitions - sz_sort_introsort_recursion(sequence, less, first, left, depth); - sz_sort_introsort_recursion(sequence, less, right + 1, last, depth); -} - -SZ_PUBLIC void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - if (sequence->count == 0) return; - sz_size_t size_is_not_power_of_two = (sequence->count & (sequence->count - 1)) != 0; - sz_size_t depth_limit = sz_size_log2i_nonzero(sequence->count) + size_is_not_power_of_two; - sz_sort_introsort_recursion(sequence, less, 0, sequence->count, depth_limit); + return SZ_NULL_CHAR; } -SZ_PUBLIC void sz_sort_recursion( // - sz_sequence_t *sequence, sz_size_t bit_idx, sz_size_t bit_max, sz_sequence_comparator_t comparator, - sz_size_t partial_order_length) { - - if (!sequence->count) return; +/** + * @brief Exact substring search helper function, that finds the first occurrence of a prefix of the needle + * using a given search function, and then verifies the remaining part of the needle. + */ +SZ_INTERNAL sz_cptr_t _sz_find_with_prefix( // + sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, sz_find_t find_prefix, sz_size_t prefix_length) { - // Array of size one doesn't need sorting - only needs the prefix to be discarded. - if (sequence->count == 1) { - sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; - order_half_words[1] = 0; - return; - } + sz_size_t suffix_length = n_length - prefix_length; + while (1) { + sz_cptr_t found = find_prefix(h, h_length, n, prefix_length); + if (!found) return SZ_NULL_CHAR; - // Partition a range of integers according to a specific bit value - sz_size_t split = 0; - sz_u64_t mask = (1ull << 63) >> bit_idx; + // Verify the remaining part of the needle + sz_size_t remaining = h_length - (found - h); + if (remaining < n_length) return SZ_NULL_CHAR; + if (sz_equal_serial(found + prefix_length, n + prefix_length, suffix_length)) return found; - // The clean approach would be to perform a single pass over the sequence. - // - // while (split != sequence->count && !(sequence->order[split] & mask)) ++split; - // for (sz_size_t i = split + 1; i < sequence->count; ++i) - // if (!(sequence->order[i] & mask)) sz_u64_swap(sequence->order + i, sequence->order + split), ++split; - // - // This, however, doesn't take into account the high relative cost of writes and swaps. - // To circumvent that, we can first count the total number entries to be mapped into either part. - // And then walk through both parts, swapping the entries that are in the wrong part. - // This would often lead to ~15% performance gain. - sz_size_t count_with_bit_set = 0; - for (sz_size_t i = 0; i != sequence->count; ++i) count_with_bit_set += (sequence->order[i] & mask) != 0; - split = sequence->count - count_with_bit_set; - - // It's possible that the sequence is already partitioned. - if (split != 0 && split != sequence->count) { - // Use two pointers to efficiently reposition elements. - // On pointer walks left-to-right from the start, and the other walks right-to-left from the end. 
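`_sz_find_with_prefix` above composes two steps: search for a short, cheap-to-match prefix with one of the fast kernels, then verify the rest of the needle, resuming after any false positive. A standalone sketch of the same composition, using `memchr` as a stand-in prefix search (not part of the patch; names and test strings are made up):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

static char const *find_with_prefix(char const *h, size_t h_length, char const *n, size_t n_length) {
    while (h_length >= n_length) {
        /* Only positions where the whole needle still fits are worth checking. */
        char const *found = (char const *)memchr(h, n[0], h_length - n_length + 1);
        if (!found) return NULL;
        if (memcmp(found, n, n_length) == 0) return found; /* verify the remaining characters */
        h_length -= (size_t)(found - h) + 1;                /* restart right after the candidate */
        h = found + 1;
    }
    return NULL;
}

int main(void) {
    char const *match = find_with_prefix("ananas and bananas", 18, "banana", 6);
    printf("%s\n", match ? match : "(none)"); /* prints "bananas" */
    return 0;
}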
- sz_size_t left = 0; - sz_size_t right = sequence->count - 1; - while (1) { - // Find the next element with the bit set on the left side. - while (left < split && !(sequence->order[left] & mask)) ++left; - // Find the next element without the bit set on the right side. - while (right >= split && (sequence->order[right] & mask)) --right; - // Swap the mispositioned elements. - if (left < split && right >= split) { - sz_u64_swap(sequence->order + left, sequence->order + right); - ++left; - --right; - } - else { break; } - } + // Adjust the position. + h = found + 1; + h_length = remaining - 1; } - // Go down recursively. - if (bit_idx < bit_max) { - sz_sequence_t a = *sequence; - a.count = split; - sz_sort_recursion(&a, bit_idx + 1, bit_max, comparator, partial_order_length); - - sz_sequence_t b = *sequence; - b.order += split; - b.count -= split; - sz_sort_recursion(&b, bit_idx + 1, bit_max, comparator, partial_order_length); - } - // Reached the end of recursion. - else { - // Discard the prefixes. - sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; - for (sz_size_t i = 0; i != sequence->count; ++i) { order_half_words[i * 2 + 1] = 0; } - - sz_sequence_t a = *sequence; - a.count = split; - sz_sort_introsort(&a, comparator); - - sz_sequence_t b = *sequence; - b.order += split; - b.count -= split; - sz_sort_introsort(&b, comparator); - } + // Unreachable, but helps silence compiler warnings: + return SZ_NULL_CHAR; } -SZ_INTERNAL sz_bool_t _sz_sort_is_less(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { - sz_cptr_t i_str = sequence->get_start(sequence, i_key); - sz_cptr_t j_str = sequence->get_start(sequence, j_key); - sz_size_t i_len = sequence->get_length(sequence, i_key); - sz_size_t j_len = sequence->get_length(sequence, j_key); - return (sz_bool_t)(sz_order_serial(i_str, i_len, j_str, j_len) == sz_less_k); -} +/** + * @brief Exact reverse-order substring search helper function, that finds the last occurrence of a suffix of the + * needle using a given search function, and then verifies the remaining part of the needle. + */ +SZ_INTERNAL sz_cptr_t _sz_rfind_with_suffix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, + sz_find_t find_suffix, sz_size_t suffix_length) { -SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t partial_order_length) { + sz_size_t prefix_length = n_length - suffix_length; + while (1) { + sz_cptr_t found = find_suffix(h, h_length, n + prefix_length, suffix_length); + if (!found) return SZ_NULL_CHAR; -#if SZ_DETECT_BIG_ENDIAN - // TODO: Implement partial sort for big-endian systems. For now this sorts the whole thing. - sz_unused(partial_order_length); - sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); -#else + // Verify the remaining part of the needle + sz_size_t remaining = found - h; + if (remaining < prefix_length) return SZ_NULL_CHAR; + if (sz_equal_serial(found - prefix_length, n, prefix_length)) return found - prefix_length; - // Export up to 4 bytes into the `sequence` bits themselves - for (sz_size_t i = 0; i != sequence->count; ++i) { - sz_cptr_t begin = sequence->get_start(sequence, sequence->order[i]); - sz_size_t length = sequence->get_length(sequence, sequence->order[i]); - length = length > 4u ? 4u : length; - sz_ptr_t prefix = (sz_ptr_t)&sequence->order[i]; - for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; + // Adjust the position. 
+ h_length = remaining - 1; } - // Perform optionally-parallel radix sort on them - sz_sort_recursion(sequence, 0, 32, (sz_sequence_comparator_t)_sz_sort_is_less, partial_order_length); -#endif + // Unreachable, but helps silence compiler warnings: + return SZ_NULL_CHAR; } -SZ_PUBLIC void sz_sort(sz_sequence_t *sequence) { -#if SZ_DETECT_BIG_ENDIAN - sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); -#else - sz_sort_partial(sequence, sequence->count); -#endif +SZ_INTERNAL sz_cptr_t _sz_find_over_4bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { + return _sz_find_with_prefix(h, h_length, n, n_length, (sz_find_t)_sz_find_4byte_serial, 4); } -#pragma endregion - -/* - * @brief AVX2 implementation of the string search algorithms. - * Very minimalistic, but still faster than the serial implementation. - */ -#pragma region AVX2 Implementation - -#if SZ_USE_X86_AVX2 -#pragma GCC push_options -#pragma GCC target("avx2") -#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) -#include +SZ_INTERNAL sz_cptr_t _sz_find_horspool_over_256bytes_serial( // + sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { + return _sz_find_with_prefix(h, h_length, n, n_length, _sz_find_horspool_upto_256bytes_serial, 256); +} -/** - * @brief Helper structure to simplify work with 256-bit registers. - */ -typedef union sz_u256_vec_t { - __m256i ymm; - __m128i xmms[2]; - sz_u64_t u64s[4]; - sz_u32_t u32s[8]; - sz_u16_t u16s[16]; - sz_u8_t u8s[32]; -} sz_u256_vec_t; - -SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: - //! https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations - return sz_order_serial(a, a_length, b, b_length); +SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_over_256bytes_serial( // + sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { + return _sz_rfind_with_suffix(h, h_length, n, n_length, _sz_rfind_horspool_upto_256bytes_serial, 256); } -SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_u256_vec_t a_vec, b_vec; +SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { + // This almost never fires, but it's better to be safe than sorry. + if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - while (length >= 32) { - a_vec.ymm = _mm256_lddqu_si256((__m256i const *)a); - b_vec.ymm = _mm256_lddqu_si256((__m256i const *)b); - // One approach can be to use "movemasks", but we could also use a bitwise matching like `_mm256_testnzc_si256`. - int difference_mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi8(a_vec.ymm, b_vec.ymm)); - if (difference_mask == 0) { a += 32, b += 32, length -= 32; } - else { return sz_false_k; } - } +#if _SZ_IS_BIG_ENDIAN + sz_find_t backends[] = { + (sz_find_t)sz_find_byte_serial, + (sz_find_t)_sz_find_horspool_upto_256bytes_serial, + (sz_find_t)_sz_find_horspool_over_256bytes_serial, + }; - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; -} + return backends[(n_length > 1) + (n_length > 256)](h, h_length, n, n_length); +#else + sz_find_t backends[] = { + // For very short strings brute-force SWAR makes sense. 
+ (sz_find_t)sz_find_byte_serial, + (sz_find_t)_sz_find_2byte_serial, + (sz_find_t)_sz_find_3byte_serial, + (sz_find_t)_sz_find_4byte_serial, + // To avoid constructing the skip-table, let's use the prefixed approach. + (sz_find_t)_sz_find_over_4bytes_serial, + // For longer needles - use skip tables. + (sz_find_t)_sz_find_horspool_upto_256bytes_serial, + (sz_find_t)_sz_find_horspool_over_256bytes_serial, + }; -SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - char value_char = *(char *)&value; - __m256i value_vec = _mm256_set1_epi8(value_char); - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores". - // - // for (; length >= 32; target += 32, length -= 32) _mm256_storeu_si256(target, value_vec); - // sz_fill_serial(target, length, value); - // - // When the buffer is small, there isn't much to innovate. - if (length <= 32) sz_fill_serial(target, length, value); - // When the buffer is aligned, we can avoid any split-stores. - else { - sz_size_t head_length = (32 - ((sz_size_t)target % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - sz_u16_t value16 = (sz_u16_t)value * 0x0101u; - sz_u32_t value32 = (sz_u32_t)value16 * 0x00010001u; - sz_u64_t value64 = (sz_u64_t)value32 * 0x0000000100000001ull; - - // Fill the head of the buffer. This part is much cleaner with AVX-512. - if (head_length & 1) *(sz_u8_t *)target = value, target++, head_length--; - if (head_length & 2) *(sz_u16_t *)target = value16, target += 2, head_length -= 2; - if (head_length & 4) *(sz_u32_t *)target = value32, target += 4, head_length -= 4; - if (head_length & 8) *(sz_u64_t *)target = value64, target += 8, head_length -= 8; - if (head_length & 16) - _mm_store_si128((__m128i *)target, _mm_set1_epi8(value_char)), target += 16, head_length -= 16; - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - - // Fill the aligned body of the buffer. - for (; body_length >= 32; target += 32, body_length -= 32) _mm256_store_si256((__m256i *)target, value_vec); - - // Fill the tail of the buffer. This part is much cleaner with AVX-512. - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - if (tail_length & 16) - _mm_store_si128((__m128i *)target, _mm_set1_epi8(value_char)), target += 16, tail_length -= 16; - if (tail_length & 8) *(sz_u64_t *)target = value64, target += 8, tail_length -= 8; - if (tail_length & 4) *(sz_u32_t *)target = value32, target += 4, tail_length -= 4; - if (tail_length & 2) *(sz_u16_t *)target = value16, target += 2, tail_length -= 2; - if (tail_length & 1) *(sz_u8_t *)target = value, target++, tail_length--; - } + return backends[ + // For very short strings brute-force SWAR makes sense. + (n_length > 1) + (n_length > 2) + (n_length > 3) + + // To avoid constructing the skip-table, let's use the prefixed approach. + (n_length > 4) + + // For longer needles - use skip tables. + (n_length > 8) + (n_length > 256)](h, h_length, n, n_length); +#endif } -SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores" and "loads". 
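The `backends[...]` expression above turns the needle-length checks into an array index: each comparison contributes 0 or 1, so their sum selects a function pointer without branches. A standalone sketch of the same dispatch pattern (not part of the patch; the backend names are made up):

#include <stddef.h>
#include <stdio.h>

typedef char const *(*find_t)(char const *, size_t, char const *, size_t);

static char const *use_byte_search(char const *h, size_t hl, char const *n, size_t nl) {
    (void)h, (void)hl, (void)n, (void)nl;
    return "single-byte backend";
}
static char const *use_short_needle_search(char const *h, size_t hl, char const *n, size_t nl) {
    (void)h, (void)hl, (void)n, (void)nl;
    return "short-needle backend";
}
static char const *use_long_needle_search(char const *h, size_t hl, char const *n, size_t nl) {
    (void)h, (void)hl, (void)n, (void)nl;
    return "long-needle backend";
}

int main(void) {
    /* Summing boolean comparisons yields 0, 1, or 2 - a branch-free index into the table. */
    find_t backends[] = {use_byte_search, use_short_needle_search, use_long_needle_search};
    size_t needle_lengths[] = {1, 42, 1000};
    for (size_t i = 0; i != 3; ++i) {
        size_t n_length = needle_lengths[i];
        printf("%zu -> %s\n", n_length, backends[(n_length > 1) + (n_length > 256)]("haystack", 8, "x", n_length));
    }
    return 0;
}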
- // - // for (; length >= 32; target += 32, source += 32, length -= 32) - // _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - // sz_copy_serial(target, source, length); - // - // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core, - // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. - // For now, let's avoid the cases beyond the L2 size. - int is_huge = length > 1ull * 1024ull * 1024ull; - if (length <= 32) { sz_copy_serial(target, source, length); } - // When dealing wirh larger arrays, the optimization is not as simple as with the `sz_fill_avx2` function, - // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer, - // we can use aligned loads and stores, and the performance will be great. - else if ((sz_size_t)target % 32 == 0 && (sz_size_t)source % 32 == 0 && !is_huge) { - for (; length >= 32; target += 32, source += 32, length -= 32) - _mm256_store_si256((__m256i *)target, _mm256_load_si256((__m256i const *)source)); - if (length) sz_copy_serial(target, source, length); - } - // The trickiest case is when both `source` and `target` are not aligned. - // In such and simpler cases we can copy enough bytes into `target` to reach its cacheline boundary, - // and then combine unaligned loads with aligned stores. - else { - sz_size_t head_length = (32 - ((sz_size_t)target % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - - // Fill the head of the buffer. This part is much cleaner with AVX-512. - if (head_length & 1) *(sz_u8_t *)target = *(sz_u8_t *)source, target++, source++, head_length--; - if (head_length & 2) *(sz_u16_t *)target = *(sz_u16_t *)source, target += 2, source += 2, head_length -= 2; - if (head_length & 4) *(sz_u32_t *)target = *(sz_u32_t *)source, target += 4, source += 4, head_length -= 4; - if (head_length & 8) *(sz_u64_t *)target = *(sz_u64_t *)source, target += 8, source += 8, head_length -= 8; - if (head_length & 16) - _mm_store_si128((__m128i *)target, _mm_lddqu_si128((__m128i const *)source)), target += 16, source += 16, - head_length -= 16; - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - - // Fill the aligned body of the buffer. - if (!is_huge) { - for (; body_length >= 32; target += 32, source += 32, body_length -= 32) - _mm256_store_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - } - // When the biffer is huge, we can traverse it in 2 directions. - else { - for (; body_length >= 64; target += 32, source += 32, body_length -= 64) { - _mm256_store_si256((__m256i *)(target), _mm256_lddqu_si256((__m256i const *)(source))); - _mm256_store_si256((__m256i *)(target + body_length - 32), - _mm256_lddqu_si256((__m256i const *)(source + body_length - 32))); - } - if (body_length) _mm256_store_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - } +SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - // Fill the tail of the buffer. This part is much cleaner with AVX-512. 
- sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - if (tail_length & 16) - _mm_store_si128((__m128i *)target, _mm_lddqu_si128((__m128i const *)source)), target += 16, source += 16, - tail_length -= 16; - if (tail_length & 8) *(sz_u64_t *)target = *(sz_u64_t *)source, target += 8, source += 8, tail_length -= 8; - if (tail_length & 4) *(sz_u32_t *)target = *(sz_u32_t *)source, target += 4, source += 4, tail_length -= 4; - if (tail_length & 2) *(sz_u16_t *)target = *(sz_u16_t *)source, target += 2, source += 2, tail_length -= 2; - if (tail_length & 1) *(sz_u8_t *)target = *(sz_u8_t *)source, target++, source++, tail_length--; - } -} + // This almost never fires, but it's better to be safe than sorry. + if (h_length < n_length || !n_length) return SZ_NULL_CHAR; + + sz_find_t backends[] = { + // For very short strings brute-force SWAR makes sense. + (sz_find_t)sz_rfind_byte_serial, + // TODO: implement reverse-order SWAR for 2/3/4 byte variants. + // TODO: (sz_find_t)_sz_rfind_2byte_serial, + // TODO: (sz_find_t)_sz_rfind_3byte_serial, + // TODO: (sz_find_t)_sz_rfind_4byte_serial, + // To avoid constructing the skip-table, let's use the prefixed approach. + // (sz_find_t)_sz_rfind_over_4bytes_serial, + // For longer needles - use skip tables. + (sz_find_t)_sz_rfind_horspool_upto_256bytes_serial, + (sz_find_t)_sz_rfind_horspool_over_256bytes_serial, + }; -SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - if (target < source || target >= source + length) { - for (; length >= 32; target += 32, source += 32, length -= 32) - _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - while (length--) *(target++) = *(source++); - } - else { - // Jump to the end and walk backwards. - for (target += length, source += length; length >= 32; length -= 32) - _mm256_storeu_si256((__m256i *)(target -= 32), _mm256_lddqu_si256((__m256i const *)(source -= 32))); - while (length--) *(--target) = *(--source); - } + return backends[ + // For very short strings brute-force SWAR makes sense. + 0 + + // To avoid constructing the skip-table, let's use the prefixed approach. + (n_length > 1) + + // For longer needles - use skip tables. + (n_length > 256)](h, h_length, n, n_length); } -SZ_PUBLIC sz_u64_t sz_checksum_avx2(sz_cptr_t text, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "loads". - // - // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core, - // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. - // For now, let's avoid the cases beyond the L2 size. - int is_huge = length > 1ull * 1024ull * 1024ull; - - // When the buffer is small, there isn't much to innovate. - if (length <= 32) { return sz_checksum_serial(text, length); } - else if (!is_huge) { - sz_u256_vec_t text_vec, sums_vec; - sums_vec.ymm = _mm256_setzero_si256(); - for (; length >= 32; text += 32, length -= 32) { - text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. 
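A note on the accumulation in the checksum code being removed here: `_mm256_sad_epu8` against a zero vector sums each group of eight bytes into a 64-bit lane, so a byte-level checksum only needs a scalar fold of four partial sums at the end. A standalone sketch of that building block (not part of the patch; compile with -mavx2):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    unsigned char bytes[32];
    uint64_t expected = 0;
    for (int i = 0; i != 32; ++i) bytes[i] = (unsigned char)i, expected += (uint64_t)i;

    __m256i text = _mm256_loadu_si256((__m256i const *)bytes);
    /* Sum-of-absolute-differences against zero adds up each 8-byte group. */
    __m256i sums = _mm256_sad_epu8(text, _mm256_setzero_si256());

    uint64_t partial[4];
    _mm256_storeu_si256((__m256i *)partial, sums);
    uint64_t total = partial[0] + partial[1] + partial[2] + partial[3];
    printf("%llu == %llu\n", (unsigned long long)total, (unsigned long long)expected); /* 496 == 496 */
    return 0;
}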
- __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - sz_u64_t result = low + high; - if (length) result += sz_checksum_serial(text, length); - return result; - } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // Most notably, we can avoid populating the cache with the entire buffer, and instead traverse it in 2 directions. - else { - sz_size_t head_length = (32 - ((sz_size_t)text % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(text + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - sz_u64_t result = 0; - - // Handle the head - while (head_length--) result += *text++; - - sz_u256_vec_t text_vec, sums_vec; - sums_vec.ymm = _mm256_setzero_si256(); - // Fill the aligned body of the buffer. - if (!is_huge) { - for (; body_length >= 32; text += 32, body_length -= 32) { - text_vec.ymm = _mm256_stream_load_si256((__m256i const *)text); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - } - // When the biffer is huge, we can traverse it in 2 directions. - else { - sz_u256_vec_t text_reversed_vec, sums_reversed_vec; - sums_reversed_vec.ymm = _mm256_setzero_si256(); - for (; body_length >= 64; text += 64, body_length -= 64) { - text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - text_reversed_vec.ymm = _mm256_stream_load_si256((__m256i *)(text + body_length - 64)); - sums_reversed_vec.ymm = _mm256_add_epi64( - sums_reversed_vec.ymm, _mm256_sad_epu8(text_reversed_vec.ymm, _mm256_setzero_si256())); - } - if (body_length >= 32) { - text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, sums_reversed_vec.ymm); - } - - // Handle the tail - while (tail_length--) result += *text++; - - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - result += low + high; - return result; - } -} +#pragma endregion // Serial Implementation -SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { +/* AVX2 implementation of the string search algorithms for Haswell processors and newer. + * Very minimalistic (compared to AVX-512), but still faster than the serial implementation. + */ +#pragma region Haswell Implementation +#if SZ_USE_HASWELL +#pragma GCC push_options +#pragma GCC target("haswell") +#pragma clang attribute push(__attribute__((target("haswell"))), apply_to = function) - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. 
- // But if at least 3 cache lines are touched, the AVX-2 implementation should be faster. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } +SZ_PUBLIC sz_bool_t sz_equal_haswell(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { + sz_u256_vec_t a_vec, b_vec; - // We need to pull the lookup table into 8x YMM registers. - // The biggest issue is reorganizing the data in the lookup table, as AVX2 doesn't have 256-bit shuffle, - // it only has 128-bit "within-lane" shuffle. Still, it's wiser to use full YMM registers, instead of XMM, - // so that we can at least compensate high latency with twice larger window and one more level of lookup. - sz_u256_vec_t lut_0_to_15_vec, lut_16_to_31_vec, lut_32_to_47_vec, lut_48_to_63_vec, // - lut_64_to_79_vec, lut_80_to_95_vec, lut_96_to_111_vec, lut_112_to_127_vec, // - lut_128_to_143_vec, lut_144_to_159_vec, lut_160_to_175_vec, lut_176_to_191_vec, // - lut_192_to_207_vec, lut_208_to_223_vec, lut_224_to_239_vec, lut_240_to_255_vec; - - lut_0_to_15_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut))); - lut_16_to_31_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 16))); - lut_32_to_47_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 32))); - lut_48_to_63_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 48))); - lut_64_to_79_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 64))); - lut_80_to_95_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 80))); - lut_96_to_111_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 96))); - lut_112_to_127_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 112))); - lut_128_to_143_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 128))); - lut_144_to_159_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 144))); - lut_160_to_175_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 160))); - lut_176_to_191_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 176))); - lut_192_to_207_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 192))); - lut_208_to_223_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 208))); - lut_224_to_239_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 224))); - lut_240_to_255_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 240))); - - // Assuming each lookup is performed within 16 elements of 256, we need to reduce the scope by 16x = 2^4. - sz_u256_vec_t not_first_bit_vec, not_second_bit_vec, not_third_bit_vec, not_fourth_bit_vec; - - /// Top and bottom nibbles of the source are used separately. - sz_u256_vec_t source_vec, source_bot_vec; - sz_u256_vec_t blended_0_to_31_vec, blended_32_to_63_vec, blended_64_to_95_vec, blended_96_to_127_vec, - blended_128_to_159_vec, blended_160_to_191_vec, blended_192_to_223_vec, blended_224_to_255_vec; - - // Handling the head. while (length >= 32) { - // Load and separate the nibbles of each byte in the source. - source_vec.ymm = _mm256_lddqu_si256((__m256i const *)source); - source_bot_vec.ymm = _mm256_and_si256(source_vec.ymm, _mm256_set1_epi8((char)0x0F)); - - // In the first round, we select using the 4th bit. 
- not_fourth_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x10), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_16_to_31_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_0_to_15_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_32_to_63_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_48_to_63_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_32_to_47_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_64_to_95_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_80_to_95_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_64_to_79_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_96_to_127_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_112_to_127_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_96_to_111_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_144_to_159_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_128_to_143_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_160_to_191_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_176_to_191_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_160_to_175_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_192_to_223_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_208_to_223_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_192_to_207_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_224_to_255_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_240_to_255_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_224_to_239_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - - // Perform a tree-like reduction of the 8x "blended" YMM registers, depending on the "source" content. - // The first round selects using the 3rd bit. - not_third_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x20), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_32_to_63_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_third_bit_vec.ymm); - blended_64_to_95_vec.ymm = _mm256_blendv_epi8( // - blended_96_to_127_vec.ymm, // - blended_64_to_95_vec.ymm, // - not_third_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - blended_160_to_191_vec.ymm, // - blended_128_to_159_vec.ymm, // - not_third_bit_vec.ymm); - blended_192_to_223_vec.ymm = _mm256_blendv_epi8( // - blended_224_to_255_vec.ymm, // - blended_192_to_223_vec.ymm, // - not_third_bit_vec.ymm); - - // The second round selects using the 2nd bit. - not_second_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x40), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_64_to_95_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_second_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - blended_192_to_223_vec.ymm, // - blended_128_to_159_vec.ymm, // - not_second_bit_vec.ymm); - - // The third round selects using the 1st bit. 
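The cascade of shuffles and blends in the look-up-transform code above exists because `_mm256_shuffle_epi8` can only index a 16-entry table within each 128-bit lane. The standalone sketch below (not part of the patch; compile with -mavx2) shows that 16-entry building block on its own, mapping nibbles to hex digits:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* A 16-entry table replicated into both lanes, exactly like the `lut_*` registers above. */
    __m256i lut = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i const *)"0123456789ABCDEF"));
    unsigned char nibbles[32], hex[33] = {0};
    for (int i = 0; i != 32; ++i) nibbles[i] = (unsigned char)(i % 16);
    __m256i indices = _mm256_loadu_si256((__m256i const *)nibbles);
    /* Each byte of `indices` selects one of the 16 table entries within its own lane. */
    _mm256_storeu_si256((__m256i *)hex, _mm256_shuffle_epi8(lut, indices));
    printf("%s\n", (char const *)hex); /* prints "0123456789ABCDEF0123456789ABCDEF" */
    return 0;
}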
- not_first_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x80), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_128_to_159_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_first_bit_vec.ymm); - - // And dump the result into the target. - _mm256_storeu_si256((__m256i *)target, blended_0_to_31_vec.ymm); - source += 32, target += 32, length -= 32; + a_vec.ymm = _mm256_lddqu_si256((__m256i const *)a); + b_vec.ymm = _mm256_lddqu_si256((__m256i const *)b); + // One approach can be to use "movemasks", but we could also use a bitwise matching like `_mm256_testnzc_si256`. + int difference_mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi8(a_vec.ymm, b_vec.ymm)); + if (difference_mask == 0) { a += 32, b += 32, length -= 32; } + else { return sz_false_k; } } - // Handle the tail. - if (length) sz_look_up_transform_serial(source, length, lut, target); + if (length) return sz_equal_serial(a, b, length); + return sz_true_k; } -SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { +SZ_PUBLIC sz_cptr_t sz_find_byte_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { int mask; sz_u256_vec_t h_vec, n_vec; n_vec.ymm = _mm256_set1_epi8(n[0]); @@ -4233,7 +852,7 @@ SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t return sz_find_byte_serial(h, h_length, n); } -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { +SZ_PUBLIC sz_cptr_t sz_rfind_byte_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { int mask; sz_u256_vec_t h_vec, n_vec; n_vec.ymm = _mm256_set1_epi8(n[0]); @@ -4248,11 +867,11 @@ SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_ return sz_rfind_byte_serial(h, h_length, n); } -SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { +SZ_PUBLIC sz_cptr_t sz_find_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { // This almost never fires, but it's better to be safe than sorry. if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_avx2(h, h_length, n); + if (n_length == 1) return sz_find_byte_haswell(h, h_length, n); // Pick the parts of the needle that are worth comparing. 
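The new `sz_equal_haswell` above leans on the `_mm256_cmpeq_epi8` + `_mm256_movemask_epi8` pair: one comparison yields a 32-bit mask with a bit per byte, and the same pair filters candidate offsets in the find functions below. A standalone sketch (not part of the patch; compile with -mavx2, and `__builtin_ctz` assumes GCC or Clang):

#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    char a[32] = "0123456789abcdefghijklmnopqrstu";
    char b[32];
    memcpy(b, a, 32);
    b[7] = '?'; /* introduce a single mismatch */

    __m256i a_vec = _mm256_loadu_si256((__m256i const *)a);
    __m256i b_vec = _mm256_loadu_si256((__m256i const *)b);
    /* Bit i of the mask is set when a[i] == b[i]; an all-ones mask means the blocks are equal. */
    unsigned mask = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(a_vec, b_vec));
    if (mask == 0xFFFFFFFFu) printf("equal\n");
    else printf("differ, first mismatch at byte %u\n", (unsigned)__builtin_ctz(~mask)); /* byte 7 */
    return 0;
}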
sz_size_t offset_first, offset_mid, offset_last; @@ -4270,9 +889,10 @@ SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, s h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_first)); h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_mid)); h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_last)); - matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); + matches = // + _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & + _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & + _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); while (matches) { int potential_offset = sz_u32_ctz(matches); if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset; @@ -4283,11 +903,11 @@ SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, s return sz_find_serial(h, h_length, n, n_length); } -SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { +SZ_PUBLIC sz_cptr_t sz_rfind_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { // This almost never fires, but it's better to be safe than sorry. if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx2(h, h_length, n); + if (n_length == 1) return sz_rfind_byte_haswell(h, h_length, n); // Pick the parts of the needle that are worth comparing. sz_size_t offset_first, offset_mid, offset_last; @@ -4307,9 +927,10 @@ SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_first)); h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_mid)); h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_last)); - matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); + matches = // + _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & + _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & + _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); while (matches) { int potential_offset = sz_u32_clz(matches); if (sz_equal(h + h_length - n_length - potential_offset, n, n_length)) @@ -4321,7 +942,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, return sz_rfind_serial(h, h_length, n, n_length); } -SZ_PUBLIC sz_cptr_t sz_find_charset_avx2(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { +SZ_PUBLIC sz_cptr_t sz_find_charset_haswell(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. // That way when we invoke `_mm256_shuffle_epi8` we can use the same mask for both lanes. 
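The serial fallback for character-set search is easiest to reason about as a plain 256-bit bitset indexed by byte value, which is what the shuffle-based kernel above vectorizes. A standalone sketch (not part of the patch; `byteset_t` is a made-up type, not the library's `sz_charset_t`):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Bit `c` of the set lives in word `c / 64` at position `c % 64`. */
typedef struct { uint64_t words[4]; } byteset_t;

static void byteset_add(byteset_t *set, unsigned char c) { set->words[c >> 6] |= 1ull << (c & 63u); }
static int byteset_has(byteset_t const *set, unsigned char c) { return (int)((set->words[c >> 6] >> (c & 63u)) & 1u); }

static char const *find_byteset(char const *text, size_t length, byteset_t const *set) {
    for (size_t i = 0; i != length; ++i)
        if (byteset_has(set, (unsigned char)text[i])) return text + i;
    return NULL;
}

int main(void) {
    byteset_t vowels = {{0, 0, 0, 0}};
    for (char const *c = "aeiou"; *c; ++c) byteset_add(&vowels, (unsigned char)*c);
    char const *match = find_byteset("rhythms and drums", 17, &vowels);
    printf("%s\n", match ? match : "(none)"); /* prints "and drums" */
    return 0;
}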
@@ -4336,11 +957,12 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_avx2(sz_cptr_t text, sz_size_t length, sz_ch sz_u256_vec_t lower_nibbles_vec, higher_nibbles_vec; sz_u256_vec_t bitset_even_vec, bitset_odd_vec; sz_u256_vec_t bitmask_vec, bitmask_lookup_vec; - bitmask_lookup_vec.ymm = _mm256_set_epi8(-128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); + bitmask_lookup_vec.ymm = _mm256_set_epi8( // + -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // + -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); while (length >= 32) { - // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set" + // The following algorithm is a transposed equivalent of the "SIMD-ized check which bytes are in a set" // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so // StrinZilla uses a somewhat different approach. // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new @@ -4408,289 +1030,27 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_avx2(sz_cptr_t text, sz_size_t length, sz_ch return sz_find_charset_serial(text, length, filter); } -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx2(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { +SZ_PUBLIC sz_cptr_t sz_rfind_charset_haswell(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { return sz_rfind_charset_serial(text, length, filter); } -/** - * @brief There is no AVX2 instruction for fast multiplication of 64-bit integers. - * This implementation is coming from Agner Fog's Vector Class Library. - */ -SZ_INTERNAL __m256i _mm256_mul_epu64(__m256i a, __m256i b) { - __m256i bswap = _mm256_shuffle_epi32(b, 0xB1); - __m256i prodlh = _mm256_mullo_epi32(a, bswap); - __m256i zero = _mm256_setzero_si256(); - __m256i prodlh2 = _mm256_hadd_epi32(prodlh, zero); - __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2, 0x73); - __m256i prodll = _mm256_mul_epu32(a, b); - __m256i prod = _mm256_add_epi64(prodll, prodlh3); - return prod; -} - -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - if (length < 4 * window_length) { - sz_hashes_serial(start, length, window_length, step, callback, callback_handle); - return; - } - - // Using AVX2, we can perform 4 long integer multiplications and additions within one register. - // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel. - sz_size_t const max_hashes = length - window_length + 1; - sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads. - sz_u8_t const *text_first = (sz_u8_t const *)start; - sz_u8_t const *text_second = text_first + min_hashes_per_thread; - sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2; - sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3; - sz_u8_t const *text_end = text_first + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Broadcast the constants into the registers. 
- sz_u256_vec_t prime_vec, golden_ratio_vec; - sz_u256_vec_t base_low_vec, base_high_vec, prime_power_low_vec, prime_power_high_vec, shift_high_vec; - base_low_vec.ymm = _mm256_set1_epi64x(31ull); - base_high_vec.ymm = _mm256_set1_epi64x(257ull); - shift_high_vec.ymm = _mm256_set1_epi64x(77ull); - prime_vec.ymm = _mm256_set1_epi64x(SZ_U64_MAX_PRIME); - golden_ratio_vec.ymm = _mm256_set1_epi64x(11400714819323198485ull); - prime_power_low_vec.ymm = _mm256_set1_epi64x(prime_power_low); - prime_power_high_vec.ymm = _mm256_set1_epi64x(prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u256_vec_t hash_low_vec, hash_high_vec, hash_mix_vec, chars_low_vec, chars_high_vec; - hash_low_vec.ymm = _mm256_setzero_si256(); - hash_high_vec.ymm = _mm256_setzero_si256(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. - hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - } - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t const step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. 
- chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - hash_low_vec.ymm = - _mm256_sub_epi64(hash_low_vec.ymm, _mm256_mul_epu64(chars_low_vec.ymm, prime_power_low_vec.ymm)); - hash_high_vec.ymm = - _mm256_sub_epi64(hash_high_vec.ymm, _mm256_mul_epu64(chars_high_vec.ymm, prime_power_high_vec.ymm)); - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. - hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - if ((cycle & step_mask) == 0) { - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - } - } -} - #pragma clang attribute pop #pragma GCC pop_options -#endif -#pragma endregion +#endif // SZ_USE_HASWELL +#pragma endregion // Haswell Implementation -/* - * @brief AVX-512 implementation of the string search algorithms. +/* AVX512 implementation of the string search algorithms for Skylake and newer CPUs. + * Includes extensions: F, CD, ER, PF, VL, DQ, BW. * - * Different subsets of AVX-512 were introduced in different years: - * - 2017 SkyLake: F, CD, ER, PF, VL, DQ, BW - * - 2018 CannonLake: IFMA, VBMI - * - 2019 IceLake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES - * - 2020 TigerLake: VP2INTERSECT + * This is the "starting level" for the advanced algorithms using K-mask registers on x86. 
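+ *  The 64-bit K-masks also enable predicated loads, like `_mm512_maskz_loadu_epi8`, so unaligned
+ *  heads and short string tails are handled without scalar loops.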
*/ -#pragma region AVX512 Implementation - -#if SZ_USE_X86_AVX512 +#pragma region Skylake Implementation +#if SZ_USE_SKYLAKE #pragma GCC push_options #pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") #pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) -#include - -/** - * @brief Helper structure to simplify work with 512-bit registers. - */ -typedef union sz_u512_vec_t { - __m512i zmm; - __m256i ymms[2]; - __m128i xmms[4]; - sz_u64_t u64s[8]; - sz_u32_t u32s[16]; - sz_u16_t u16s[32]; - sz_u8_t u8s[64]; - sz_i64_t i64s[8]; - sz_i32_t i32s[16]; -} sz_u512_vec_t; - -SZ_INTERNAL __mmask64 _sz_u64_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 64: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 64: - return _bzhi_u64(0xFFFFFFFFFFFFFFFF, n < 64 ? (sz_u32_t)n : 64); -} - -SZ_INTERNAL __mmask32 _sz_u32_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 32: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 32: - return _bzhi_u32(0xFFFFFFFF, n < 32 ? (sz_u32_t)n : 32); -} - -SZ_INTERNAL __mmask16 _sz_u16_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 16: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 16: - return _bzhi_u32(0xFFFFFFFF, n < 16 ? (sz_u32_t)n : 16); -} - -SZ_INTERNAL __mmask16 _sz_u16_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 16: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 16: - return (__mmask16)_bzhi_u32(0xFFFFFFFF, (sz_u32_t)n); -} - -SZ_INTERNAL __mmask32 _sz_u32_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 32: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 32: - return _bzhi_u32(0xFFFFFFFF, (sz_u32_t)n); -} - -SZ_INTERNAL __mmask64 _sz_u64_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 64: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 64: - return _bzhi_u64(0xFFFFFFFFFFFFFFFF, (sz_u32_t)n); -} - -SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - sz_u512_vec_t a_vec, b_vec; - - // Pointer arithmetic is cheap, fetching memory is not! - // So we can use the masked loads to fetch at most one cache-line for each string, - // compare the prefixes, and only then move forward. - sz_size_t a_head_length = 64 - ((sz_size_t)a % 64); // 63 or less. - sz_size_t b_head_length = 64 - ((sz_size_t)b % 64); // 63 or less. - a_head_length = a_head_length < a_length ? a_head_length : a_length; - b_head_length = b_head_length < b_length ? b_head_length : b_length; - sz_size_t head_length = a_head_length < b_head_length ? 
a_head_length : b_head_length; - __mmask64 head_mask = _sz_u64_mask_until(head_length); - a_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, b); - __mmask64 mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - else if (head_length == a_length && head_length == b_length) { return sz_equal_k; } - else { a += head_length, b += head_length, a_length -= head_length, b_length -= head_length; } - - // The rare case, when both string are very long. - __mmask64 a_mask, b_mask; - while ((a_length >= 64) & (b_length >= 64)) { - a_vec.zmm = _mm512_loadu_si512(a); - b_vec.zmm = _mm512_loadu_si512(b); - mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - a += 64, b += 64, a_length -= 64, b_length -= 64; - } - // In most common scenarios at least one of the strings is under 64 bytes. - if (a_length | b_length) { - a_mask = _sz_u64_clamp_mask_until(a_length); - b_mask = _sz_u64_clamp_mask_until(b_length); - a_vec.zmm = _mm512_maskz_loadu_epi8(a_mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(b_mask, b); - // The AVX-512 `_mm512_mask_cmpneq_epi8_mask` intrinsics are generally handy in such environments. - // They, however, have latency 3 on most modern CPUs. Using AVX2: `_mm256_cmpeq_epi8` would have - // been cheaper, if we didn't have to apply `_mm256_movemask_epi8` afterwards. - mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - // From logic perspective, the hardest cases are "abc\0" and "abc". - // The result must be `sz_greater_k`, as the latter is shorter. - else { return _sz_order_scalars(a_length, b_length); } - } - - return sz_equal_k; -} - -SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { +SZ_PUBLIC sz_bool_t sz_equal_skylake(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { __mmask64 mask; sz_u512_vec_t a_vec, b_vec; @@ -4714,219 +1074,6 @@ SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length) return sz_true_k; } -SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - __m512i value_vec = _mm512_set1_epi8(value); - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores". - // - // for (; length >= 64; target += 64, length -= 64) _mm512_storeu_si512(target, value_vec); - // _mm512_mask_storeu_epi8(target, _sz_u64_mask_until(length), value_vec); - // - // When the buffer is small, there isn't much to innovate. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, value_vec); - } - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. 
Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - _mm512_mask_storeu_epi8(target, head_mask, value_vec); - for (target += head_length; body_length >= 64; target += 64, body_length -= 64) - _mm512_store_si512(target, value_vec); - _mm512_mask_storeu_epi8(target, tail_mask, value_vec); - } -} - -SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores" and "loads". - // - // for (; length >= 64; target += 64, source += 64, length -= 64) - // _mm512_storeu_si512(target, _mm512_loadu_si512(source)); - // __mmask64 mask = _sz_u64_mask_until(length); - // _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - // - // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, - // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. - // With two strings, we may consider the overal workload huge, if each exceeds 1 MB in length. - int const is_huge = length >= 1ull * 1024ull * 1024ull; - - // When the buffer is small, there isn't much to innovate. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - } - // When dealing wirh larger arrays, the optimization is not as simple as with the `sz_fill_avx512` function, - // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer, - // we can use aligned loads and stores, and the performance will be great. - else if ((sz_size_t)target % 64 == 0 && (sz_size_t)source % 64 == 0 && !is_huge) { - for (; length >= 64; target += 64, source += 64, length -= 64) - _mm512_store_si512(target, _mm512_load_si512(source)); - // At this point the length is guaranteed to be under 64. - __mmask64 mask = _sz_u64_mask_until(length); - // Aligned load and stores would work too, but it's not defined. - _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - } - // The trickiest case is when both `source` and `target` are not aligned. - // In such and simpler cases we can copy enough bytes into `target` to reach its cacheline boundary, - // and then combine unaligned loads with aligned stores. - else if (!is_huge) { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - for (target += head_length, source += head_length; body_length >= 64; - target += 64, source += 64, body_length -= 64) - _mm512_store_si512(target, _mm512_loadu_si512(source)); // Unaligned load, but aligned store! 
- _mm512_mask_storeu_epi8(target, tail_mask, _mm512_maskz_loadu_epi8(tail_mask, source)); - } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // - // 1. Moving in both directions to maximize the throughput, when fetching from multiple - // memory pages. Also helps with cache set-associativity issues, as we won't always - // be fetching the same entries in the lookup table. - // 2. Using non-temporal stores to avoid polluting the cache. - // 3. Prefetching the next cache line, to avoid stalling the CPU. This generally useless - // for predictable patterns, so disregard this advice. - // - // Bidirectional traversal adds about 10%, accelerating from 11 GB/s to 12 GB/s. - // Using "streaming stores" boosts us from 12 GB/s to 19 GB/s. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; - sz_size_t tail_length = (sz_size_t)(target + length) % 64; - sz_size_t body_length = length - head_length - tail_length; - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - _mm512_mask_storeu_epi8(target + head_length + body_length, tail_mask, - _mm512_maskz_loadu_epi8(tail_mask, source)); - - // Now in the main loop, we can use non-temporal loads and stores, - // performing the operation in both directions. - for (target += head_length, source += head_length; // - body_length >= 128; // - target += 64, source += 64, body_length -= 128) { - _mm512_stream_si512((__m512i *)(target), _mm512_loadu_si512(source)); - _mm512_stream_si512((__m512i *)(target + body_length - 64), _mm512_loadu_si512(source + body_length - 64)); - } - if (body_length >= 64) _mm512_stream_si512((__m512i *)target, _mm512_loadu_si512(source)); - } -} - -SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - if (target == source) return; // Don't be silly, don't move the data if it's already there. - - // On very short buffers, that are one cache line in width or less, we don't need any loops. - // We can also avoid any data-dependencies between iterations, assuming we have 32 registers - // to pre-load the data, before writing it back. 
- if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - } - else if (length <= 128) { - sz_size_t last_length = length - 64; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_maskz_loadu_epi8(mask, source + 64); - _mm512_storeu_epi8(target, source0); - _mm512_mask_storeu_epi8(target + 64, mask, source1); - } - else if (length <= 192) { - sz_size_t last_length = length - 128; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_loadu_epi8(source + 64); - __m512i source2 = _mm512_maskz_loadu_epi8(mask, source + 128); - _mm512_storeu_epi8(target, source0); - _mm512_storeu_epi8(target + 64, source1); - _mm512_mask_storeu_epi8(target + 128, mask, source2); - } - else if (length <= 256) { - sz_size_t last_length = length - 192; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_loadu_epi8(source + 64); - __m512i source2 = _mm512_loadu_epi8(source + 128); - __m512i source3 = _mm512_maskz_loadu_epi8(mask, source + 192); - _mm512_storeu_epi8(target, source0); - _mm512_storeu_epi8(target + 64, source1); - _mm512_storeu_epi8(target + 128, source2); - _mm512_mask_storeu_epi8(target + 192, mask, source3); - } - - // If the regions don't overlap at all, just use "copy" and save some brain cells thinking about corner cases. - else if (target + length < source || target >= source + length) { sz_copy_avx512(target, source, length); } - - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - // The absolute most common case of using "moves" is shifting the data within a continuous buffer - // when adding a removing some values in it. In such cases, a typical shift is by 1, 2, 4, 8, 16, - // or 32 bytes, rarely larger. For small shifts, under the size of the ZMM register, we can use shuffles. - // - // Remember: - // - if we are shifting data left, that we are traversing to the right. - // - if we are shifting data right, that we are traversing to the left. - int const left_to_right_traversal = source > target; - - // Now we guarantee, that the relative shift within registers is from 1 to 63 bytes and the output is aligned. - // Hopefully, we need to shift more than two ZMM registers, so we could consider `valignr` instruction. - // Sadly, using `_mm512_alignr_epi8` doesn't make sense, as it operates at a 128-bit granularity. - // - // - `_mm256_alignr_epi8` shifts entire 256-bit register, but we need many of them. - // - `_mm512_alignr_epi32` shifts 512-bit chunks, but only if the `shift` is a multiple of 4 bytes. - // - `_mm512_alignr_epi64` shifts 512-bit chunks by 8 bytes. 
- // - // All of those have a latency of 1 cycle, and the shift amount must be an immediate value! - // For 1-byte-shift granularity, the `_mm512_permutex2var_epi8` has a latency of 6 and needs VBMI! - // The most efficient and broadly compatible alternative could be to use a combination of align and shuffle. - // A similar approach was outlined in "Byte-wise alignr in AVX512F" by Wojciech Muła. - // http://0x80.pl/notesen/2016-10-16-avx512-byte-alignr.html - // - // That solution, is extremely mouthful, assuming we need compile time constants for the shift amount. - // A cleaner one, with a latency of 3 cycles, is to use `_mm512_permutexvar_epi8` or - // `_mm512_mask_permutexvar_epi8`, which can be seen as combination of a cross-register shuffle and blend, - // and is available with VBMI. That solution is still noticeably slower than AVX2. - // - // The GLibC implementation also uses non-temporal stores for larger buffers, we don't. - // https://codebrowser.dev/glibc/glibc/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S.html - if (left_to_right_traversal) { - // Head, body, and tail. - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - for (target += head_length, source += head_length; body_length >= 64; - target += 64, source += 64, body_length -= 64) - _mm512_store_si512(target, _mm512_loadu_si512(source)); - _mm512_mask_storeu_epi8(target, tail_mask, _mm512_maskz_loadu_epi8(tail_mask, source)); - } - else { - // Tail, body, and head. - _mm512_mask_storeu_epi8(target + head_length + body_length, tail_mask, - _mm512_maskz_loadu_epi8(tail_mask, source + head_length + body_length)); - for (; body_length >= 64; body_length -= 64) - _mm512_store_si512(target + head_length + body_length - 64, - _mm512_loadu_si512(source + head_length + body_length - 64)); - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - } - } -} - SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { __mmask64 mask; sz_u512_vec_t h_vec, n_vec; @@ -4950,7 +1097,7 @@ SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr return SZ_NULL_CHAR; } -SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { +SZ_PUBLIC sz_cptr_t sz_find_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { // This almost never fires, but it's better to be safe than sorry. if (h_length < n_length || !n_length) return SZ_NULL_CHAR; @@ -4969,20 +1116,21 @@ SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); // Scan through the string. 
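     // For every 64-byte block, three characters of the needle are probed at once: `_mm512_cmpeq_epi8_mask`
     // yields a 64-bit match mask per character, `_kand_mask64` intersects those masks, and only the
     // surviving offsets are verified further.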
- // We have several optimized versions of the lagorithm for shorter strings, + // We have several optimized versions of the algorithm for shorter strings, // but they all mimic the default case for unbounded length needles if (n_length >= 64) { for (; h_length >= n_length + 64; h += 64, h_length -= 64) { h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); + matches = _kand_mask64( // + _kand_mask64( // Intersect the masks + _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), + _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), + _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); while (matches) { int potential_offset = sz_u64_ctz(matches); - if (sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset; + if (sz_equal_skylake(h + potential_offset, n, n_length)) return h + potential_offset; matches &= matches - 1; } @@ -4996,10 +1144,11 @@ SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); + matches = _kand_mask64( // + _kand_mask64( // Intersect the masks + _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), + _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), + _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); if (matches) return h + sz_u64_ctz(matches); } } @@ -5014,10 +1163,11 @@ SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); + matches = _kand_mask64( // + _kand_mask64( // Intersect the masks + _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), + _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), + _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); while (matches) { int potential_offset = sz_u64_ctz(matches); h_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, h + potential_offset); @@ -5034,893 +1184,126 @@ SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); + matches = _kand_mask64( // + _kand_mask64( // Intersect the masks + _mm512_cmpeq_epi8_mask(h_first_vec.zmm, 
n_first_vec.zmm), + _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), + _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); while (matches) { int potential_offset = sz_u64_ctz(matches); - if (n_length <= 3 || sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset; + if (n_length <= 3 || sz_equal_skylake(h + potential_offset, n, n_length)) return h + potential_offset; matches &= matches - 1; } } - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); - - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h + h_length - 64); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + h_length - 1 - sz_u64_clz(mask); - h_length -= 64; - } - - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + 64 - sz_u64_clz(mask) - 1; - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx512(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. - __mmask64 mask; - __mmask64 matches; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. - sz_cptr_t h_reversed; - for (; h_length >= n_length + 64; h_length -= 64) { - h_reversed = h + h_length - n_length - 64 + 1; - h_first_vec.zmm = _mm512_loadu_si512(h_reversed + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h_reversed + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h_reversed + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_avx512(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } - - // The "tail" of the function uses masked loads to process the remaining bytes. 
- { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_avx512(h + 64 - potential_offset - 1, n, n_length)) - return h + 64 - potential_offset - 1; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } - - return SZ_NULL_CHAR; -} - -#pragma clang attribute pop -#pragma GCC pop_options - -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ - apply_to = function) - -/** - * @brief Computes the edit distance between two very short byte-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 63, and evaluates at most (63 * 2 + 1 = 127) diagonals, or just as many loop cycles. - * Supports an early exit, if the distance is bounded. - * Keeps all of the data and Levenshtein matrices skew diagonal in just a couple of registers. - * Benefits from the @b `vpermb` instructions, that can rotate the bytes across the entire ZMM register. - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - - sz_size_t const max_length = 63u; - sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); - sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); - - // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register. - // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. - sz_size_t const shorter_dim = shorter_length + 1; - sz_size_t const longer_dim = longer_length + 1; - - // The next few buffers will be swapped around. - sz_u512_vec_t previous_vec, current_vec, next_vec; - sz_u512_vec_t gaps_vec, substitutions_vec; - - // Load the strings into ZMM registers - just once. 
- sz_u512_vec_t longer_vec, shorter_vec, shorter_rotated_vec, rotate_left_vec, rotate_right_vec, ones_vec, bound_vec; - longer_vec.zmm = _mm512_maskz_loadu_epi8(_sz_u64_mask_until(longer_length), longer); - rotate_left_vec.zmm = _mm512_set_epi8( // - 0, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, // - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, // - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, // - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - rotate_right_vec.zmm = _mm512_set_epi8( // - 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, // - 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, // - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, // - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 63); - ones_vec.zmm = _mm512_set1_epi8(1); - bound_vec.zmm = _mm512_set1_epi8(bound <= 255 ? (sz_u8_t)bound : 255); - - // To simplify comparisons and traversals, we want to reverse the order of bytes in the shorter string. - for (sz_size_t i = 0; i != shorter_length; ++i) shorter_vec.u8s[63 - i] = shorter[i]; - shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_vec.zmm); - - // Let's say we are dealing with 3 and 5 letter words. - // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim). - // It will have: - // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3. - // - 2 diagonals of fixed length, at positions: 4, 5. - // - 3 diagonals of decreasing length, at positions: 6, 7, 8. - sz_size_t const diagonals_count = shorter_dim + longer_dim - 1; - - // Initialize the first two diagonals: - // - // previous_vec.u8s[0] = 0; - // current_vec.u8s[0] = current_vec.u8s[1] = 1; - // - // We can do a similar thing with vector ops: - previous_vec.zmm = _mm512_setzero_si512(); - current_vec.zmm = _mm512_set1_epi8(1); - - // We skip diagonals 0 and 1, as they are trivial. - // We will start with diagonal 2, which has length 3, with the first and last elements being preset, - // so we are effectively computing just one value, as will be marked by a single set bit in - // the `next_diagonal_mask` on the very first iteration. - sz_size_t next_diagonal_index = 2; - __mmask64 next_diagonal_mask = 0; - - // Progress through the upper triangle of the Levenshtein matrix. - for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) { - // After this iteration, the values at offset `0` and `next_diagonal_index` in the `next_vec` - // should be set to `next_diagonal_index`, but it's easier to broadcast the value to the whole vector, - // and later merge with a mask with new values. - next_vec.zmm = _mm512_set1_epi8((sz_u8_t)next_diagonal_index); - - // The mask also adds one set bit. - next_diagonal_mask = _kor_mask64(next_diagonal_mask, 1); - next_diagonal_mask = _kshiftli_mask64(next_diagonal_mask, 1); - - // Check for equality between string slices. 
- __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - substitutions_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, substitutions_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(_mm512_permutexvar_epi8(rotate_right_vec.zmm, current_vec.zmm), current_vec.zmm), - ones_vec.zmm); - next_vec.zmm = _mm512_mask_min_epu8(next_vec.zmm, next_diagonal_mask, gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = current_vec.zmm; - current_vec.zmm = next_vec.zmm; - - // Shift the shorter string - shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_rotated_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. - __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - } - - // Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles. - for (; next_diagonal_index != longer_dim; ++next_diagonal_index) { - // After this iteration, the value `shorted_dim - 1` in the `next_vec` - // should be set to `next_diagonal_index`, but it's easier to broadcast the value to the whole vector, - // and later merge with a mask with new values. - next_vec.zmm = _mm512_set1_epi8((sz_u8_t)next_diagonal_index); - - // Make sure we update the first entry. - next_diagonal_mask = _kor_mask64(next_diagonal_mask, 1); - - // Check for equality between string slices. - __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(current_vec.zmm, _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm)), - ones_vec.zmm); - next_vec.zmm = _mm512_mask_min_epu8(next_vec.zmm, next_diagonal_mask, gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm); - current_vec.zmm = next_vec.zmm; - - // Let's shift the longer string now. - longer_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, longer_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. - __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - } - - // Now let's handle the bottom right triangle. - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - - // Check for equality between string slices. 
- __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(current_vec.zmm, _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm)), - ones_vec.zmm); - next_vec.zmm = _mm512_min_epu8(gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm); - current_vec.zmm = next_vec.zmm; - - // Let's shift the longer string now. - longer_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, longer_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. - __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - // In every following iterations we take use a shorter prefix of each register, - // but we don't need to update the `next_diagonal_mask` anymore... except for the early exit. - next_diagonal_mask = _kshiftri_mask64(next_diagonal_mask, 1); - } - return current_vec.u8s[0]; -} - -/** - * @brief Computes the edit distance between two somewhat short bytes-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 127, and evaluates at most (127 * 2 + 1 = 255) diagonals. - * Supports an early exit, if the distance is bounded. - * Uses a lot more CPU registers space, than the `upto63` variant. - * Benefits from the @b `vpermi2b` instructions, that can rotate the bytes in 2 registers at once. - * - * This may be one of the most freuqently called kernels for: - * - source code analysis, assuming most lines are either under 80 or under 120 characters long. - * - DNA sequence alignment, as most short reads are 50-300 characters long. - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto127_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -/** - * @brief Computes the edit distance between two longer bytes-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 255, and evaluates at most (255 * 2 + 1 = 511) diagonals. - * Supports an early exit, if the distance is bounded. - * Uses a lot more CPU registers space, than the `upto63` variant. - * - * Each of 2x string ends up occupying 4 ZMM registers, and each of 3x diagonals uses 4 ZMM registers. - * So 20x of the 32x are persistently occupied, and the rest are used for math temporarily. - * This is the largest space-efficient variant, as strings beyond 255 characters may require - * 16-bit accumulators, which would be a significant bottleneck. - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -/** - * @brief Computes the edit distance between two longer bytes-strings using the AVX-512VBMI extensions, - * assuming the upper distance bound can not exceed 255, but the string length can be arbitrary. 
- * - * Applies to string lengths up to 255, and evaluates at most (255 * 2 + 1 = 511) diagonals. - * Supports an early exit, if the distance is bounded. - * Uses a lot more CPU registers space, than the `upto63` variant. - * - * Each of 2x string ends up occupying 4 ZMM registers, and each of 3x diagonals uses 4 ZMM registers. - * So 20x of the 32x are persistently occupied, and the rest are used for math temporarily. - * This is the largest space-efficient variant, as strings beyond 255 characters may require - * 16-bit accumulators, which would be a significant bottleneck. - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto255bound_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -/** - * @brief Computes the edit distance between two mid-length UTF-8-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 127, and evaluates at most (127 * 2 + 1 = 511) diagonals. - * Supports an early exit, if the distance is bounded. - * Benefits from the @b `valignd` instructions used to rotate UTF-32 unpacked unicode codepoints. - * - * Each string is unpacked into 128 characters * 4 bytes per character / 64 bytes per register = 8 registers. - * - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_utf8_skewed_diagonals_upto127_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - sz_unused(shorter && longer && bound && alloc); - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // TODO: Generalize! - sz_size_t const max_length = 256u * 256u; - sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); - sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); - sz_unused(longer_length && bound && max_length); - -#if 0 - // We are going to store 3 diagonals of the matrix. - // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. - sz_size_t const shorter_dim = shorter_length + 1; - sz_size_t const longer_dim = longer_length + 1; - // Unlike the serial version, we also want to avoid reverse-order iteration over teh shorter string. - // So let's allocate a bit more memory and reverse-export our shorter string into that buffer. - sz_size_t const buffer_length = sizeof(sz_u16_t) * longer_dim * 3 + shorter_length; - sz_u16_t *const distances = (sz_u16_t *)alloc->allocate(buffer_length, alloc->handle); - if (!distances) return SZ_SIZE_MAX; - - // The next few pointers will be swapped around. - sz_u16_t *previous_distances = distances; - sz_u16_t *current_distances = previous_distances + longer_dim; - sz_u16_t *next_distances = current_distances + longer_dim; - sz_ptr_t const shorter_reversed = (sz_ptr_t)(next_distances + longer_dim); - - // Export the reversed string into the buffer. 
- for (sz_size_t i = 0; i != shorter_length; ++i) shorter_reversed[i] = shorter[shorter_length - 1 - i]; - - // Initialize the first two diagonals: - previous_distances[0] = 0; - current_distances[0] = current_distances[1] = 1; - - // Using ZMM registers, we can process 32x 16-bit values at once, - // storing 16 bytes of each string in YMM registers. - sz_u512_vec_t insertions_vec, deletions_vec, substitutions_vec, next_vec; - sz_u512_vec_t ones_u16_vec; - ones_u16_vec.zmm = _mm512_set1_epi16(1); - - // This is a mixed-precision implementation, using 8-bit representations for part of the operations. - // Even there, in case `SZ_USE_X86_AVX2=0`, let's use the `sz_u512_vec_t` type, addressing the first YMM halfs. - sz_u512_vec_t shorter_vec, longer_vec; - sz_u512_vec_t ones_u8_vec; - ones_u8_vec.ymms[0] = _mm256_set1_epi8(1); - - // Let's say we are dealing with 3 and 5 letter words. - // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim). - // It will have: - // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3. - // - 2 diagonals of fixed length, at positions: 4, 5. - // - 3 diagonals of decreasing length, at positions: 6, 7, 8. - sz_size_t const diagonals_count = shorter_dim + longer_dim - 1; - - // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_diagonal_index = 2; - for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = next_diagonal_index + 1; - for (sz_size_t offset_within_diagonal = 0; offset_within_diagonal + 2 < next_diagonal_length;) { - sz_u32_t remaining_length = (sz_u32_t)(next_diagonal_length - offset_within_diagonal - 2); - sz_u32_t register_length = remaining_length < 32 ? remaining_length : 32; - sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length); - longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + offset_within_diagonal); - // Our original code addressed the shorter string `[next_diagonal_index - offset_within_diagonal - 2]` - // for growing `offset_within_diagonal`. If the `shorter` string was reversed, the - // `[next_diagonal_index - offset_within_diagonal - 2]` would be equal to `[shorter_length - 1 - - // next_diagonal_index + offset_within_diagonal + 2]`. Which simplified would be equal to - // `[shorter_length - next_diagonal_index + offset_within_diagonal + 1]`. - shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8( // - remaining_length_mask, - shorter_reversed + shorter_length - next_diagonal_index + offset_within_diagonal + 1); - // For substitutions, perform the equality comparison using AVX2 instead of AVX-512 - // to get the result as a vector, instead of a bitmask. Adding 1 to every scalar we can overflow - // transforming from {0xFF, 0} values to {0, 1} values - exactly what we need. Then - upcast to 16-bit. - substitutions_vec.zmm = _mm512_cvtepi8_epi16( // - _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0])); - substitutions_vec.zmm = _mm512_add_epi16( // - substitutions_vec.zmm, - _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + offset_within_diagonal)); - // For insertions and deletions, on modern hardware, it's faster to issue two separate loads, - // than rotate the bytes in the ZMM register. 
- insertions_vec.zmm = - _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + offset_within_diagonal); - deletions_vec.zmm = - _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + offset_within_diagonal + 1); - // First get the minimum of insertions and deletions. - next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm); - next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm); - _mm512_mask_storeu_epi16(next_distances + offset_within_diagonal + 1, remaining_length_mask, next_vec.zmm); - offset_within_diagonal += register_length; - } - // Don't forget to populate the first row and the first column of the Levenshtein matrix. - next_distances[0] = next_distances[next_diagonal_length - 1] = (sz_u16_t)next_diagonal_index; - // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory. - sz_u16_t *temporary = previous_distances; - previous_distances = current_distances; - current_distances = next_distances; - next_distances = temporary; - } - - // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a - // larger diagonal. From now onwards, we will be shrinking. Instead of adding value equal to the skewed diagonal - // index on either side, we will be cropping those values out. - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; - for (sz_size_t i = 0; i != next_diagonal_length;) { - sz_u32_t remaining_length = (sz_u32_t)(next_diagonal_length - i); - sz_u32_t register_length = remaining_length < 32 ? remaining_length : 32; - sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length); - longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + next_diagonal_index - n + i); - // Our original code addressed the shorter string `[shorter_length - 1 - i]` for growing `i`. - // If the `shorter` string was reversed, the `[shorter_length - 1 - i]` would - // be equal to `[shorter_length - 1 - shorter_length + 1 + i]`. - // Which simplified would be equal to just `[i]`. Beautiful! - shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, shorter_reversed + i); - // For substitutions, perform the equality comparison using AVX2 instead of AVX-512 - // to get the result as a vector, instead of a bitmask. The compare it against the accumulated - // substitution costs. - substitutions_vec.zmm = _mm512_cvtepi8_epi16( // - _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0])); - substitutions_vec.zmm = _mm512_add_epi16( // - substitutions_vec.zmm, _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + i)); - // For insertions and deletions, on modern hardware, it's faster to issue two separate loads, - // than rotate the bytes in the ZMM register. - insertions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i); - deletions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i + 1); - // First get the minimum of insertions and deletions. 
- next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm); - next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm); - _mm512_mask_storeu_epi16(next_distances + i, remaining_length_mask, next_vec.zmm); - i += register_length; - } - - // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory, this time, with a shift, - // dropping the first element in the current array. - sz_u16_t *temporary = previous_distances; - previous_distances = current_distances + 1; - current_distances = next_distances; - next_distances = temporary; - } - - // Cache scalar before `free` call. - sz_size_t result = current_distances[0]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -#endif - return 0; + return SZ_NULL_CHAR; } -SZ_INTERNAL sz_size_t sz_edit_distance_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Bounded computations may exit early. - int const is_bounded = bound < longer_length; - if (is_bounded) { - // If one of the strings is empty - the edit distance is equal to the length of the other one. - if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); - // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; +SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { + __mmask64 mask; + sz_u512_vec_t h_vec, n_vec; + n_vec.zmm = _mm512_set1_epi8(n[0]); + + while (h_length >= 64) { + h_vec.zmm = _mm512_loadu_si512(h + h_length - 64); + mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); + if (mask) return h + h_length - 1 - sz_u64_clz(mask); + h_length -= 64; } - // Make sure the shorter string is actually shorter. - if (shorter_length > longer_length) { - sz_cptr_t temporary = shorter; - shorter = longer; - longer = temporary; - sz_size_t temporary_length = shorter_length; - shorter_length = longer_length; - longer_length = temporary_length; + if (h_length) { + mask = _sz_u64_mask_until(h_length); + h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); + // Reuse the same `mask` variable to find the bit that doesn't match + mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); + if (mask) return h + 64 - sz_u64_clz(mask) - 1; } - // Dispatch the right implementation based on the length of the strings. - if (longer_length < 64u) - return _sz_edit_distance_skewed_diagonals_upto63_avx512( // - shorter, shorter_length, longer, longer_length, bound); - // else if (longer_length < 256u * 256u) - // return _sz_edit_distance_skewed_diagonals_upto65k_avx512( // - // shorter, shorter_length, longer, longer_length, bound, alloc); - else - return sz_edit_distance_serial(shorter, shorter_length, longer, longer_length, bound, alloc); + return SZ_NULL_CHAR; } -SZ_PUBLIC sz_u64_t sz_checksum_avx512(sz_cptr_t text, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "loads". - // - // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, - // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. - // With two strings, we may consider the overal workload huge, if each exceeds 1 MB in length. 
-    int const is_huge = length >= 1ull * 1024ull * 1024ull;
-    sz_u512_vec_t text_vec, sums_vec;
-
-    // When the buffer is small, there isn't much to innovate.
-    if (length <= 16) {
-        __mmask16 mask = _sz_u16_mask_until(length);
-        text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text);
-        sums_vec.xmms[0] = _mm_sad_epu8(text_vec.xmms[0], _mm_setzero_si128());
-        sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_vec.xmms[0]);
-        sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_vec.xmms[0], 1);
-        return low + high;
-    }
-    else if (length <= 32) {
-        __mmask32 mask = _sz_u32_mask_until(length);
-        text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text);
-        sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256());
-        // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first.
-        __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]);
-        __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1);
-        __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm);
-        sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm);
-        sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1);
-        return low + high;
-    }
-    else if (length <= 64) {
-        __mmask64 mask = _sz_u64_mask_until(length);
-        text_vec.zmm = _mm512_maskz_loadu_epi8(mask, text);
-        sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
-        return _mm512_reduce_add_epi64(sums_vec.zmm);
-    }
-    else if (!is_huge) {
-        sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less.
-        sz_size_t tail_length = (sz_size_t)(text + length) % 64;    // 63 or less.
-        sz_size_t body_length = length - head_length - tail_length; // Multiple of 64.
-        __mmask64 head_mask = _sz_u64_mask_until(head_length);
-        __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
-        text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
-        sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
-        for (text += head_length; body_length >= 64; text += 64, body_length -= 64) {
-            text_vec.zmm = _mm512_load_si512((__m512i const *)text);
-            sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
-        }
-        text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text);
-        sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
-        return _mm512_reduce_add_epi64(sums_vec.zmm);
-    }
-    // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use.
-    //
-    //    1. Moving in both directions to maximize the throughput, when fetching from multiple
-    //       memory pages. Also helps with cache set-associativity issues, as we won't always
-    //       be fetching the same entries in the lookup table.
-    //    2. Using non-temporal stores to avoid polluting the cache.
-    //    3. Prefetching the next cache line, to avoid stalling the CPU. This is generally useless
-    //       for predictable patterns, so disregard this advice.
-    //
-    // Bidirectional traversal generally adds about 10% to such algorithms.
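/* A minimal standalone sketch (not part of this patch) of the reduction primitive used in every
 * branch of this function: `_mm512_sad_epu8` against a zero vector yields eight 64-bit sums of
 * the eight 8-byte groups, and `_mm512_reduce_add_epi64` collapses them into the byte-sum of a
 * full 64-byte block. Assumes AVX-512F/BW, `<immintrin.h>`, and a hypothetical `block` pointer
 * with at least 64 readable bytes. */
#include <immintrin.h>
#include <stdint.h>
static uint64_t sum_64_bytes(unsigned char const *block) {
    __m512i bytes = _mm512_loadu_si512((void const *)block);
    __m512i partial_sums = _mm512_sad_epu8(bytes, _mm512_setzero_si512()); // 8x 64-bit partial sums
    return (uint64_t)_mm512_reduce_add_epi64(partial_sums);                // total of all 64 bytes
}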
- else { - sz_u512_vec_t text_reversed_vec, sums_reversed_vec; - sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; - sz_size_t tail_length = (sz_size_t)(text + length) % 64; - sz_size_t body_length = length - head_length - tail_length; - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length); - sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()); - - // Now in the main loop, we can use non-temporal loads and stores, - // performing the operation in both directions. - for (text += head_length; body_length >= 128; text += 64, text += 64, body_length -= 128) { - text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text)); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - text_reversed_vec.zmm = _mm512_stream_load_si512((__m512i *)(text + body_length - 64)); - sums_reversed_vec.zmm = - _mm512_add_epi64(sums_reversed_vec.zmm, _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512())); - } - if (body_length >= 64) { - text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text)); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - } +SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - return _mm512_reduce_add_epi64(_mm512_add_epi64(sums_vec.zmm, sums_reversed_vec.zmm)); - } -} + // This almost never fires, but it's better to be safe than sorry. + if (h_length < n_length || !n_length) return SZ_NULL_CHAR; + if (n_length == 1) return sz_rfind_byte_avx512(h, h_length, n); -SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { + // Pick the parts of the needle that are worth comparing. + sz_size_t offset_first, offset_mid, offset_last; + _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - if (length < window_length || !window_length) return; - if (length < 4 * window_length) { - sz_hashes_serial(start, length, window_length, step, callback, callback_handle); - return; - } + // Broadcast those characters into ZMM registers. + __mmask64 mask; + __mmask64 matches; + sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; + n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); + n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); + n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - // Using AVX2, we can perform 4 long integer multiplications and additions within one register. - // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel. - sz_size_t const max_hashes = length - window_length + 1; - sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads. - sz_u8_t const *text_first = (sz_u8_t const *)start; - sz_u8_t const *text_second = text_first + min_hashes_per_thread; - sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2; - sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3; - sz_u8_t const *text_end = text_first + length; - - // Broadcast the global constants into the registers. 
- // Both high and low hashes will work with the same prime and golden ratio. - sz_u512_vec_t prime_vec, golden_ratio_vec; - prime_vec.zmm = _mm512_set1_epi64(SZ_U64_MAX_PRIME); - golden_ratio_vec.zmm = _mm512_set1_epi64(11400714819323198485ull); - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // We will be evaluating 4 offsets at a time with 2 different hash functions. - // We can fit all those 8 state variables in each of the following ZMM registers. - sz_u512_vec_t base_vec, prime_power_vec, shift_vec; - base_vec.zmm = _mm512_set_epi64(31ull, 31ull, 31ull, 31ull, 257ull, 257ull, 257ull, 257ull); - shift_vec.zmm = _mm512_set_epi64(0ull, 0ull, 0ull, 0ull, 77ull, 77ull, 77ull, 77ull); - prime_power_vec.zmm = _mm512_set_epi64(prime_power_low, prime_power_low, prime_power_low, prime_power_low, - prime_power_high, prime_power_high, prime_power_high, prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u512_vec_t hash_vec, chars_vec; - hash_vec.zmm = _mm512_setzero_si512(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`... - chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - - // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); + // Scan through the string. + sz_cptr_t h_reversed; + for (; h_length >= n_length + 64; h_length -= 64) { + h_reversed = h + h_length - n_length - 64 + 1; + h_first_vec.zmm = _mm512_loadu_si512(h_reversed + offset_first); + h_mid_vec.zmm = _mm512_loadu_si512(h_reversed + offset_mid); + h_last_vec.zmm = _mm512_loadu_si512(h_reversed + offset_last); + matches = _kand_mask64( // + _kand_mask64( // Intersect the masks + _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), + _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), + _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); + while (matches) { + int potential_offset = sz_u64_clz(matches); + if (n_length <= 3 || sz_equal_skylake(h + h_length - n_length - potential_offset, n, n_length)) + return h + h_length - n_length - potential_offset; + sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && + "The bit must be set before we squash it"); + matches &= ~((sz_u64_t)1 << (63 - potential_offset)); + } } - // 5. Compute the hash mix, that will be used to index into the fingerprint. 
- // This includes a serial step at the end. - sz_u512_vec_t hash_mix_vec; - hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_extracti64x4_epi64(hash_mix_vec.zmm, 0)); - - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. - chars_vec.zmm = _mm512_set_epi64(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length], // - text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - hash_vec.zmm = _mm512_sub_epi64(hash_vec.zmm, _mm512_mullo_epi64(chars_vec.zmm, prime_power_vec.zmm)); - - // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - - // ... and prefetch the next four characters into Level 2 or higher. - _mm_prefetch((sz_cptr_t)text_fourth + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_third + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_second + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_first + 1, _MM_HINT_T1); - - // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_castsi512_si256(hash_mix_vec.zmm)); - - if ((cycle & step_mask) == 0) { - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); + // The "tail" of the function uses masked loads to process the remaining bytes. 
+ { + mask = _sz_u64_mask_until(h_length - n_length + 1); + h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); + h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); + h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); + matches = _kand_mask64( // + _kand_mask64( // Intersect the masks + _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), + _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), + _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); + while (matches) { + int potential_offset = sz_u64_clz(matches); + if (n_length <= 3 || sz_equal_skylake(h + 64 - potential_offset - 1, n, n_length)) + return h + 64 - potential_offset - 1; + sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && + "The bit must be set before we squash it"); + matches &= ~((sz_u64_t)1 << (63 - potential_offset)); } } + + return SZ_NULL_CHAR; } #pragma clang attribute pop #pragma GCC pop_options +#endif // SZ_USE_SKYLAKE +#pragma endregion // Skylake Implementation +/* AVX512 implementation of the string search algorithms for Ice Lake and newer CPUs. + * Includes extensions: + * - 2017 Skylake: F, CD, ER, PF, VL, DQ, BW, + * - 2018 CannonLake: IFMA, VBMI, + * - 2019 Ice Lake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES. + */ +#pragma region Ice Lake Implementation +#if SZ_USE_ICE #pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512vbmi", "avx512vbmi2", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512vbmi,avx512vbmi2,bmi,bmi2"))), \ +#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2") +#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ apply_to = function) -SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - // But if at least 3 cache lines are touched, the AVX-512 implementation should be faster. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } - - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - // We need to pull the lookup table into 4x ZMM registers. - // We can use `vpermi2b` instruction to perform the look in two ZMM registers with `_mm512_permutex2var_epi8` - // intrinsics, but it has a 6-cycle latency on Sapphire Rapids and requires AVX512-VBMI. Assuming we need to - // operate on 4 registers, it might be cleaner to use 2x separate `_mm512_permutexvar_epi8` calls. - // Combining the results with 2x `_mm512_test_epi8_mask` and 3x blends afterwards. 
- // - // - 4x `_mm512_permutexvar_epi8` maps to "VPERMB (ZMM, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p5 - // - On Genoa: 6 cycles latency, ports: 1*FP12 - // - 3x `_mm512_mask_blend_epi8` maps to "VPBLENDMB_Z (ZMM, K, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p05 - // - On Genoa: 1 cycle latency, ports: 1*FP0123 - // - 2x `_mm512_test_epi8_mask` maps to "VPTESTMB (K, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p5 - // - On Genoa: 4 cycles latency, ports: 1*FP01 - // - sz_u512_vec_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec; - lut_0_to_63_vec.zmm = _mm512_loadu_si512((lut)); - lut_64_to_127_vec.zmm = _mm512_loadu_si512((lut + 64)); - lut_128_to_191_vec.zmm = _mm512_loadu_si512((lut + 128)); - lut_192_to_255_vec.zmm = _mm512_loadu_si512((lut + 192)); - - sz_u512_vec_t first_bit_vec, second_bit_vec; - first_bit_vec.zmm = _mm512_set1_epi8((char)0x80); - second_bit_vec.zmm = _mm512_set1_epi8((char)0x40); - - __mmask64 first_bit_mask, second_bit_mask; - sz_u512_vec_t source_vec; - // If the top bit is set in each word of `source_vec`, than we use `lookup_128_to_191_vec` or - // `lookup_192_to_255_vec`. If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`. - sz_u512_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec; - sz_u512_vec_t blended_0_to_127_vec, blended_128_to_255_vec, blended_0_to_255_vec; - - // Handling the head. - if (head_length) { - source_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_mask_storeu_epi8(target, head_mask, blended_0_to_255_vec.zmm); - source += head_length, target += head_length, length -= head_length; - } - - // Handling the body in 64-byte chunks aligned to cache-line boundaries with respect to `target`. 
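/* A scalar sketch (not part of this patch) of what the permute-and-blend sequence above computes
 * for every input byte: the top two bits select one of the four 64-entry quarters of the table,
 * and the low six bits - the only ones VPERMB consumes - index into that quarter. */
static sz_u8_t look_up_scalar_equivalent(sz_u8_t const *lut, sz_u8_t byte) {
    sz_u8_t const *quarter = lut + (byte >> 6) * 64; // 0, 64, 128, or 192
    return quarter[byte & 63];                       // same result as lut[byte]
}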
- while (length >= 64) { - source_vec.zmm = _mm512_loadu_si512(source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_store_si512(target, blended_0_to_255_vec.zmm); //! Aligned store, our main weapon! - source += 64, target += 64, length -= 64; - } - - // Handling the tail. - if (tail_length) { - source_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_mask_storeu_epi8(target, tail_mask, blended_0_to_255_vec.zmm); - source += tail_length, target += tail_length, length -= tail_length; - } -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { +SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { // Before initializing the AVX-512 vectors, we may want to run the sequential code for the first few bytes. // In practice, that only hurts, even when we have matches every 5-ish bytes. @@ -6035,365 +1418,30 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_ return SZ_NULL_CHAR; } -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { +SZ_PUBLIC sz_cptr_t sz_rfind_charset_ice(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { return sz_rfind_charset_serial(text, length, filter); } -SZ_PUBLIC sz_cptr_t sz_find_many_avx512( // - sz_cptr_t haystack, sz_size_t haystack_length, // - sz_cptr_t const *needles, sz_size_t const *needles_lengths, // - sz_size_t *needle_offset) { - - // When dealing with huge needles vocabularies, like in tokenization workloads, we need to construct an automaton. 
- // But in many cases, the vocabulary is small enough to use a simpler DFA-less approach, combining the ideas from - // the `sz_find_avx512` and `sz_find_charset_avx512` functions. - // - // Pick the offsets within needles where there is the least variance in the characters. - // Like for "the", "then", "there", "these", "those", "their", "they", "them", "that", "this", "thus", "than": - // - // 0: 't' - // 1: 'h' - // 2: 'e', 'a', 'i', 'o', 'u' - // 3: 'n', 'r', 's', 'i', 'y', 'm', 't' - // - // So depending on our "register budget", we can use a different number of pivot points: offset 0, 1, 2 make - // the most sense if we can only use 3 ZMM registers. - sz_unused(haystack && haystack_length && needles && needles_lengths && needle_offset); - return 0; -} - -/** - * Computes the Needleman Wunsch alignment score between two strings. - * The method uses 32-bit integers to accumulate the running score for every cell in the matrix. - * Assuming the costs of substitutions can be arbitrary signed 8-bit integers, the method is expected to be used - * on strings not exceeding 2^24 length or 16.7 million characters. - * - * Unlike the `_sz_edit_distance_skewed_diagonals_upto65k_avx512` method, this one uses signed integers to store - * the accumulated score. Moreover, it's primary bottleneck is the latency of gathering the substitution costs - * from the substitution matrix. If we use the diagonal order, we will be comparing a slice of the first string with - * a slice of the second. If we stick to the conventional horizontal order, we will be comparing one character against - * a slice, which is much easier to optimize. In that case we are sampling costs not from arbitrary parts of - * a 256 x 256 matrix, but from a single row! - */ -SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { - - // If one of the strings is empty - the edit distance is equal to the length of the other one - if (longer_length == 0) return (sz_ssize_t)shorter_length * gap; - if (shorter_length == 0) return (sz_ssize_t)longer_length * gap; - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - sz_size_t const max_length = 256ull * 256ull * 256ull; - sz_size_t const n = longer_length + 1; - sz_assert(n < max_length && "The length must fit into 24-bit integer. Otherwise use serial variant."); - sz_unused(longer_length && max_length); - - sz_size_t buffer_length = sizeof(sz_i32_t) * n * 2; - sz_i32_t *distances = (sz_i32_t *)alloc->allocate(buffer_length, alloc->handle); - sz_i32_t *previous_distances = distances; - sz_i32_t *current_distances = previous_distances + n; - - // Intialize the first row of the Levenshtein matrix with `iota`. - for (sz_size_t idx_longer = 0; idx_longer != n; ++idx_longer) - previous_distances[idx_longer] = (sz_i32_t)idx_longer * gap; - - /// Contains up to 16 consecutive characters from the longer string. 
-    sz_u512_vec_t longer_vec;
-    sz_u512_vec_t cost_deletion_vec, cost_substitution_vec, lookup_substitution_vec, current_vec;
-    sz_u512_vec_t row_first_subs_vec, row_second_subs_vec, row_third_subs_vec, row_fourth_subs_vec;
-    sz_u512_vec_t shuffled_first_subs_vec, shuffled_second_subs_vec, shuffled_third_subs_vec, shuffled_fourth_subs_vec;
-
-    // Prepare constants and masks.
-    sz_u512_vec_t is_third_or_fourth_vec, is_second_or_fourth_vec, gap_vec;
-    {
-        char is_third_or_fourth_check, is_second_or_fourth_check;
-        *(sz_u8_t *)&is_third_or_fourth_check = 0x80, *(sz_u8_t *)&is_second_or_fourth_check = 0x40;
-        is_third_or_fourth_vec.zmm = _mm512_set1_epi8(is_third_or_fourth_check);
-        is_second_or_fourth_vec.zmm = _mm512_set1_epi8(is_second_or_fourth_check);
-        gap_vec.zmm = _mm512_set1_epi32(gap);
-    }
-
-    sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter;
-    for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) {
-        sz_i32_t last_in_row = current_distances[0] = (sz_i32_t)(idx_shorter + 1) * gap;
-
-        // Load one row of the substitution matrix into four ZMM registers.
-        sz_error_cost_t const *row_subs = subs + shorter_unsigned[idx_shorter] * 256u;
-        row_first_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 0);
-        row_second_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 1);
-        row_third_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 2);
-        row_fourth_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 3);
-
-        // In the serial version we have one forward pass that computes the deletion,
-        // insertion, and substitution costs at once.
-        //    for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) {
-        //        sz_ssize_t cost_deletion = previous_distances[idx_longer + 1] + gap;
-        //        sz_ssize_t cost_insertion = current_distances[idx_longer] + gap;
-        //        sz_ssize_t cost_substitution = previous_distances[idx_longer] + row_subs[longer_unsigned[idx_longer]];
-        //        current_distances[idx_longer + 1] = sz_min_of_three(cost_deletion, cost_insertion, cost_substitution);
-        //    }
-        //
-        // Given the complexity of handling the data-dependency between consecutive insertion cost computations
-        // within a Levenshtein matrix, the simplest design would be to vectorize every kind of cost computation
-        // separately.
-        //    1. Compute substitution costs for up to 64 characters at once, upcasting from 8-bit integers to 32.
-        //    2. Compute the pairwise minimum with deletion costs.
-        //    3. Inclusive prefix minimum computation to combine with addition costs.
-        // Proceeding with substitutions:
-        for (sz_size_t idx_longer = 0; idx_longer < longer_length; idx_longer += 64) {
-            sz_size_t register_length = sz_min_of_two(longer_length - idx_longer, 64);
-            __mmask64 mask = _sz_u64_mask_until(register_length);
-            longer_vec.zmm = _mm512_maskz_loadu_epi8(mask, longer + idx_longer);
-
-            // Blend the `row_(first|second|third|fourth)_subs_vec` into `current_vec`, picking the right source
-            // for every character in `longer_vec`. Before that, we need to permute the substitution vectors.
-            // Only the bottom 6 bits of a byte are used in VPERMB, so we don't even need to mask.
- shuffled_first_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_first_subs_vec.zmm); - shuffled_second_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_second_subs_vec.zmm); - shuffled_third_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_third_subs_vec.zmm); - shuffled_fourth_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_fourth_subs_vec.zmm); - - // To blend we can invoke three `_mm512_cmplt_epu8_mask`, but we can also achieve the same using - // the AND logical operation, checking the top two bits of every byte. - // Continuing this thought, we can use the VPTESTMB instruction to output the mask after the AND. - __mmask64 is_third_or_fourth = _mm512_mask_test_epi8_mask(mask, longer_vec.zmm, is_third_or_fourth_vec.zmm); - __mmask64 is_second_or_fourth = - _mm512_mask_test_epi8_mask(mask, longer_vec.zmm, is_second_or_fourth_vec.zmm); - lookup_substitution_vec.zmm = _mm512_mask_blend_epi8( - is_third_or_fourth, - // Choose between the first and the second. - _mm512_mask_blend_epi8(is_second_or_fourth, shuffled_first_subs_vec.zmm, shuffled_second_subs_vec.zmm), - // Choose between the third and the fourth. - _mm512_mask_blend_epi8(is_second_or_fourth, shuffled_third_subs_vec.zmm, shuffled_fourth_subs_vec.zmm)); - - // First, sign-extend lower and upper 16 bytes to 16-bit integers. - __m512i current_0_31_vec = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(lookup_substitution_vec.zmm, 0)); - __m512i current_32_63_vec = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(lookup_substitution_vec.zmm, 1)); - - // Now extend those 16-bit integers to 32-bit. - // This isn't free, same as the subsequent store, so we only want to do that for the populated lanes. - // To minimize the number of loads and stores, we can combine our substitution costs with the previous - // distances, containing the deletion costs. - { - cost_substitution_vec.zmm = _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer); - cost_substitution_vec.zmm = _mm512_add_epi32( - cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_0_31_vec, 0))); - cost_deletion_vec.zmm = _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer); - cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm); - current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm); - - // Inclusive prefix minimum computation to combine with insertion costs. - // Simply disabling this operation results in 5x performance improvement, meaning - // that this operation is responsible for 80% of the total runtime. - // for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) { - // current_distances[idx_longer + 1] = - // sz_max_of_two(current_distances[idx_longer] + gap, current_distances[idx_longer + 1]); - // } - // - // To perform the same operation in vectorized form, we need to perform a tree-like reduction, - // that will involve multiple steps. It's quite expensive and should be first tested in the - // "experimental" section. - // - // Another approach might be loop unrolling: - // current_vec.i32s[0] = last_in_row = sz_i32_max_of_two(current_vec.i32s[0], last_in_row + gap); - // current_vec.i32s[1] = last_in_row = sz_i32_max_of_two(current_vec.i32s[1], last_in_row + gap); - // current_vec.i32s[2] = last_in_row = sz_i32_max_of_two(current_vec.i32s[2], last_in_row + gap); - // ... yet this approach is also quite expensive. 
-                for (int i = 0; i != 16; ++i)
-                    current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
-                _mm512_mask_storeu_epi32(current_distances + idx_longer + 1, (__mmask16)mask, current_vec.zmm);
-            }
-
-            // Export the values from 16 to 31.
-            if (register_length > 16) {
-                mask = _kshiftri_mask64(mask, 16);
-                cost_substitution_vec.zmm =
-                    _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 16);
-                cost_substitution_vec.zmm = _mm512_add_epi32(
-                    cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_0_31_vec, 1)));
-                cost_deletion_vec.zmm =
-                    _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 16);
-                cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm);
-                current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm);
-
-                // Aggregate running insertion costs within the register.
-                for (int i = 0; i != 16; ++i)
-                    current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
-                _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 16, (__mmask16)mask, current_vec.zmm);
-            }
-
-            // Export the values from 32 to 47.
-            if (register_length > 32) {
-                mask = _kshiftri_mask64(mask, 16);
-                cost_substitution_vec.zmm =
-                    _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 32);
-                cost_substitution_vec.zmm = _mm512_add_epi32(
-                    cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_32_63_vec, 0)));
-                cost_deletion_vec.zmm =
-                    _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 32);
-                cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm);
-                current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm);
-
-                // Aggregate running insertion costs within the register.
-                for (int i = 0; i != 16; ++i)
-                    current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
-                _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 32, (__mmask16)mask, current_vec.zmm);
-            }
-
-            // Export the values from 48 to 63.
-            if (register_length > 48) {
-                mask = _kshiftri_mask64(mask, 16);
-                cost_substitution_vec.zmm =
-                    _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 48);
-                cost_substitution_vec.zmm = _mm512_add_epi32(
-                    cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_32_63_vec, 1)));
-                cost_deletion_vec.zmm =
-                    _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 48);
-                cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm);
-                current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm);
-
-                // Aggregate running insertion costs within the register.
-                for (int i = 0; i != 16; ++i)
-                    current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
-                _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 48, (__mmask16)mask, current_vec.zmm);
-            }
-        }
-
-        // Swap previous_distances and current_distances pointers
-        sz_pointer_swap((void **)&previous_distances, (void **)&current_distances);
-    }
-
-    // Cache scalar before `free` call.
-    sz_ssize_t result = previous_distances[longer_length];
-    alloc->free(distances, buffer_length, alloc->handle);
-    return result;
-}
-
-SZ_INTERNAL sz_ssize_t sz_alignment_score_avx512( //
-    sz_cptr_t shorter, sz_size_t shorter_length,  //
-    sz_cptr_t longer, sz_size_t longer_length,    //
-    sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) {
-
-    if (sz_max_of_two(shorter_length, longer_length) < (256ull * 256ull * 256ull))
-        return _sz_alignment_score_wagner_fisher_upto17m_avx512(shorter, shorter_length, longer, longer_length, subs,
-                                                                gap, alloc);
-    else
-        return sz_alignment_score_serial(shorter, shorter_length, longer, longer_length, subs, gap, alloc);
-}
-
-enum sz_encoding_t {
-    sz_encoding_unknown_k = 0,
-    sz_encoding_ascii_k = 1,
-    sz_encoding_utf8_k = 2,
-    sz_encoding_utf16_k = 3,
-    sz_encoding_utf32_k = 4,
-    sz_jwt_k,
-    sz_base64_k,
-    // Low priority encodings:
-    sz_encoding_utf8bom_k = 5,
-    sz_encoding_utf16le_k = 6,
-    sz_encoding_utf16be_k = 7,
-    sz_encoding_utf32le_k = 8,
-    sz_encoding_utf32be_k = 9,
-};
-
-// Character Set Detection is one of the most commonly performed operations in data processing with
-// [Chardet](https://github.com/chardet/chardet), [Charset Normalizer](https://github.com/jawah/charset_normalizer),
-// [cChardet](https://github.com/PyYoshi/cChardet) being the most commonly used options in the Python ecosystem.
-// All of them are notoriously slow.
-//
-// Moreover, as of October 2024, UTF-8 is the dominant character encoding on the web, used by 98.4% of websites.
-// Others have minimal usage, according to [W3Techs](https://w3techs.com/technologies/overview/character_encoding):
-// - ISO-8859-1: 1.2%
-// - Windows-1252: 0.3%
-// - Windows-1251: 0.2%
-// - EUC-JP: 0.1%
-// - Shift JIS: 0.1%
-// - EUC-KR: 0.1%
-// - GB2312: 0.1%
-// - Windows-1250: 0.1%
-// Within programming language implementations and database management systems, 16-bit and 32-bit fixed-width encodings
-// are also very popular and we need a way to efficiently differentiate between the most common UTF flavors, ASCII, and
-// the rest.
-//
-// One good solution is the [simdutf](https://github.com/simdutf/simdutf) library, but it depends on the C++ runtime
-// and focuses more on incremental validation & transcoding, rather than detection.
-//
-// So we need a very fast and efficient way of determining the encoding.
-SZ_PUBLIC sz_bool_t sz_detect_encoding(sz_cptr_t text, sz_size_t length) {
-    // https://github.com/simdutf/simdutf/blob/master/src/icelake/icelake_utf8_validation.inl.cpp
-    // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_from_utf8.inl.cpp#L81
-    // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L661
-    // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L788
-
-    // We can implement this operation differently and more simply, assuming most of the time continuous chunks of
-    // memory have identical encoding. With Russian and many European languages, we generally deal with 2-byte
-    // codepoints with occasional 1-byte punctuation marks. In the case of Chinese, Japanese, and Korean, we deal
-    // with 3-byte codepoints. In the case of emojis, we deal with 4-byte codepoints.
-    // We can also use the idea that misaligned reads are quite cheap on modern CPUs.
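/* A scalar sketch (not part of this patch) of the codepoint-width idea mentioned above, using
 * only the UTF-8 lead byte and ignoring invalid lead bytes: ASCII stays 1 byte, most Cyrillic
 * and European scripts start 2-byte sequences, CJK mostly 3-byte, and emojis 4-byte, while
 * 0x80-0xBF are continuation bytes. */
static int utf8_sequence_length(sz_u8_t lead) {
    if (lead < 0x80) return 1; // ASCII
    if (lead < 0xC0) return 0; // continuation byte, not a sequence start
    if (lead < 0xE0) return 2; // 2-byte codepoints, e.g. Cyrillic
    if (lead < 0xF0) return 3; // 3-byte codepoints, e.g. CJK
    return 4;                  // 4-byte codepoints, e.g. emoji
}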
- int can_be_ascii = 1, can_be_utf8 = 1, can_be_utf16 = 1, can_be_utf32 = 1; - sz_unused(can_be_ascii + can_be_utf8 + can_be_utf16 + can_be_utf32); - sz_unused(text && length); - return sz_false_k; -} - #pragma clang attribute pop #pragma GCC pop_options -#endif +#endif // SZ_USE_ICE +#pragma endregion // Ice Lake Implementation -#pragma endregion - -/* @brief Implementation of the string search algorithms using the Arm NEON instruction set, available on 64-bit - * Arm processors. Implements: {substring search, character search, character set search} x {forward, reverse}. +/* Implementation of the string search algorithms using the Arm NEON instruction set, available on 64-bit + * Arm processors. Covers billions of mobile CPUs worldwide, including Apple's A-series, and Qualcomm's Snapdragon. */ -#pragma region ARM NEON - -#if SZ_USE_ARM_NEON +#pragma region NEON Implementation +#if SZ_USE_NEON #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+simd") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) -/** - * @brief Helper structure to simplify work with 64-bit words. - */ -typedef union sz_u128_vec_t { - uint8x16_t u8x16; - uint16x8_t u16x8; - uint32x4_t u32x4; - uint64x2_t u64x2; - sz_u64_t u64s[2]; - sz_u32_t u32s[4]; - sz_u16_t u16s[8]; - sz_u8_t u8s[16]; -} sz_u128_vec_t; - SZ_INTERNAL sz_u64_t _sz_vreinterpretq_u8_u4(uint8x16_t vec) { // Use `vshrn` to produce a bitmask, similar to `movemask` in SSE. // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(vec), 4)), 0) & 0x8888888888888888ull; } -SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: - //! https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations - return sz_order_serial(a, a_length, b, b_length); -} - SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { sz_u128_vec_t a_vec, b_vec; for (; length >= 16; a += 16, b += 16, length -= 16) { @@ -6408,131 +1456,6 @@ SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { return sz_true_k; } -SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) { - uint64x2_t sum_vec = vdupq_n_u64(0); - - // Process 16 bytes (128 bits) at a time - for (; length >= 16; text += 16, length -= 16) { - uint8x16_t vec = vld1q_u8((sz_u8_t const *)text); // Load 16 bytes - uint16x8_t pairwise_sum1 = vpaddlq_u8(vec); // Pairwise add lower and upper 8 bits - uint32x4_t pairwise_sum2 = vpaddlq_u16(pairwise_sum1); // Pairwise add 16-bit results - uint64x2_t pairwise_sum3 = vpaddlq_u32(pairwise_sum2); // Pairwise add 32-bit results - sum_vec = vaddq_u64(sum_vec, pairwise_sum3); // Accumulate the sum - } - - // Final reduction of `sum_vec` to a single scalar - sz_u64_t sum = vgetq_lane_u64(sum_vec, 0) + vgetq_lane_u64(sum_vec, 1); - if (length) sum += sz_checksum_serial(text, length); - return sum; -} - -SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // In most cases the `source` and the `target` are not aligned, but we should - // at least make sure that writes don't touch many cache lines. - // NEON has an instruction to load and write 64 bytes at once. 
- // - // sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - // sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - // for (; head_length; target += 1, source += 1, head_length -= 1) *target = *source; - // length -= head_length; - // for (; length >= 64; target += 64, source += 64, length -= 64) - // vst4q_u8((sz_u8_t *)target, vld1q_u8_x4((sz_u8_t const *)source)); - // for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = *source; - // - // Sadly, those instructions end up being 20% slower than the code processing 16 bytes at a time: - for (; length >= 16; target += 16, source += 16, length -= 16) - vst1q_u8((sz_u8_t *)target, vld1q_u8((sz_u8_t const *)source)); - if (length) sz_copy_serial(target, source, length); -} - -SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // When moving small buffers, using a small buffer on stack as a temporary storage is faster. - - if (target < source || target >= source + length) { - // Non-overlapping, proceed forward - sz_copy_neon(target, source, length); - } - else { - // Overlapping, proceed backward - target += length; - source += length; - - sz_u128_vec_t src_vec; - while (length >= 16) { - target -= 16, source -= 16, length -= 16; - src_vec.u8x16 = vld1q_u8((sz_u8_t const *)source); - vst1q_u8((sz_u8_t *)target, src_vec.u8x16); - } - while (length) { - target -= 1, source -= 1, length -= 1; - *target = *source; - } - } -} - -SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - uint8x16_t fill_vec = vdupq_n_u8(value); // Broadcast the value across the register - - while (length >= 16) { - vst1q_u8((sz_u8_t *)target, fill_vec); - target += 16; - length -= 16; - } - - // Handle remaining bytes - if (length) sz_fill_serial(target, length, value); -} - -SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } - - sz_size_t head_length = (16 - ((sz_size_t)target % 16)) % 16; // 15 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 16; // 15 or less. - - // We need to pull the lookup table into 16x NEON registers. We have a total of 32 such registers. - // According to the Neoverse V2 manual, the 4-table lookup has a latency of 6 cycles, and 4x throughput. - uint8x16x4_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec; - lut_0_to_63_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 0)); - lut_64_to_127_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 64)); - lut_128_to_191_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 128)); - lut_192_to_255_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 192)); - - sz_u128_vec_t source_vec; - // If the top bit is set in each word of `source_vec`, than we use `lookup_128_to_191_vec` or - // `lookup_192_to_255_vec`. If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`. 
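/* A scalar sketch (not part of this patch) of the XOR trick used with `vqtbl4q_u8` below:
 * TBL returns 0 for indices outside the 64-byte table, so XOR-ing the input with 0x00, 0x40,
 * 0x80, and 0xC0 lets each 64-entry quarter of the table answer only for its own range of
 * the 0..255 input space, and the four partial results can simply be OR-ed together. */
static sz_u8_t look_up_quarter(sz_u8_t const *quarter, sz_u8_t index) {
    return index < 64 ? quarter[index] : 0; // what TBL does with out-of-range indices
}
static sz_u8_t look_up_neon_equivalent(sz_u8_t const *lut, sz_u8_t byte) {
    return (sz_u8_t)(look_up_quarter(lut + 0, byte ^ 0x00) | look_up_quarter(lut + 64, byte ^ 0x40) |
                     look_up_quarter(lut + 128, byte ^ 0x80) | look_up_quarter(lut + 192, byte ^ 0xC0));
}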
- sz_u128_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec; - sz_u128_vec_t blended_0_to_255_vec; - - // Process the head with serial code - for (; head_length; target += 1, source += 1, head_length -= 1) *target = lut[*(sz_u8_t const *)source]; - - // Table lookups on Arm are much simpler to use than on x86, as we can use the `vqtbl4q_u8` instruction - // to perform a 4-table lookup in a single instruction. The XORs are used to adjust the lookup position - // within each 64-byte range of the table. - // Details on the 4-table lookup: https://lemire.me/blog/2019/07/23/arbitrary-byte-to-byte-maps-using-arm-neon/ - length -= head_length; - length -= tail_length; - for (; length >= 16; source += 16, target += 16, length -= 16) { - source_vec.u8x16 = vld1q_u8((sz_u8_t const *)source); - lookup_0_to_63_vec.u8x16 = vqtbl4q_u8(lut_0_to_63_vec, source_vec.u8x16); - lookup_64_to_127_vec.u8x16 = vqtbl4q_u8(lut_64_to_127_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x40))); - lookup_128_to_191_vec.u8x16 = vqtbl4q_u8(lut_128_to_191_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x80))); - lookup_192_to_255_vec.u8x16 = vqtbl4q_u8(lut_192_to_255_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0xc0))); - blended_0_to_255_vec.u8x16 = vorrq_u8(vorrq_u8(lookup_0_to_63_vec.u8x16, lookup_64_to_127_vec.u8x16), - vorrq_u8(lookup_128_to_191_vec.u8x16, lookup_192_to_255_vec.u8x16)); - vst1q_u8((sz_u8_t *)target, blended_0_to_255_vec.u8x16); - } - - // Process the tail with serial code - for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = lut[*(sz_u8_t const *)source]; -} - SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { sz_u64_t matches; sz_u128_vec_t h_vec, n_vec, matches_vec; @@ -6569,8 +1492,8 @@ SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_ return sz_rfind_byte_serial(h, h_length, n); } -SZ_PUBLIC sz_u64_t _sz_find_charset_neon_register(sz_u128_vec_t h_vec, uint8x16_t set_top_vec_u8x16, - uint8x16_t set_bottom_vec_u8x16) { +SZ_PUBLIC sz_u64_t _sz_find_charset_neon_register( // + sz_u128_vec_t h_vec, uint8x16_t set_top_vec_u8x16, uint8x16_t set_bottom_vec_u8x16) { // Once we've read the characters in the haystack, we want to // compare them against our bitset. The serial version of that code @@ -6744,253 +1667,36 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_ch #pragma clang attribute pop #pragma GCC pop_options -#endif // Arm Neon - -#pragma endregion +#endif // SZ_USE_NEON +#pragma endregion // NEON Implementation -/* @brief Implementation of the string search algorithms using the Arm SVE variable-length registers, available - * in Arm v9 processors. - * - * Implements: - * - memory: {copy, move, fill} - * - comparisons: {equal, order} - * - search: {substring, character, character set} x {forward, reverse}. +/* Implementation of the string search algorithms using the Arm SVE variable-length registers, + * available in Arm v9 processors, like in Apple M4+ and Graviton 3+ CPUs. 
*/ -#pragma region ARM SVE - -#if SZ_USE_ARM_SVE +#pragma region SVE Implementation +#if SZ_USE_SVE #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+sve") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) - -SZ_PUBLIC void sz_fill_sve(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - svuint8_t value_vec = svdup_u8(value); - sz_size_t vec_len = svcntb(); // Vector length in bytes (scalable) - - if (length <= vec_len) { - // Small buffer case: use mask to handle small writes - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)length); - svst1_u8(mask, (unsigned char *)target, value_vec); - } - else { - // Calculate head, body, and tail sizes - sz_size_t head_length = vec_len - ((sz_size_t)target % vec_len); - sz_size_t tail_length = (sz_size_t)(target + length) % vec_len; - sz_size_t body_length = length - head_length - tail_length; - - // Handle unaligned head - svbool_t head_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)head_length); - svst1_u8(head_mask, (unsigned char *)target, value_vec); - target += head_length; - - // Aligned body loop - for (; body_length >= vec_len; target += vec_len, body_length -= vec_len) { - svst1_u8(svptrue_b8(), (unsigned char *)target, value_vec); - } - - // Handle unaligned tail - svbool_t tail_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)tail_length); - svst1_u8(tail_mask, (unsigned char *)target, value_vec); - } -} - -SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - sz_size_t vec_len = svcntb(); // Vector length in bytes - - // Arm Neoverse V2 cores in Graviton 4, for example, come with 256 KB of L1 data cache per core, - // and 8 MB of L2 cache per core. Moreover, the L1 cache is fully associative. - // With two strings, we may consider the overal workload huge, if each exceeds 1 MB in length. - // - // int is_huge = length >= 4ull * 1024ull * 1024ull; - // - // When the buffer is small, there isn't much to innovate. - if (length <= vec_len) { - // Small buffer case: use mask to handle small writes - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - } - // When dealing with larger buffers, similar to AVX-512, we want minimize unaligned operations - // and handle the head, body, and tail separately. We can also traverse the buffer in both directions - // as Arm generally supports more simultaneous stores than x86 CPUs. - // - // For gigantic datasets, similar to AVX-512, non-temporal "loads" and "stores" can be used. - // Sadly, if the register size (16 byte or larger) is smaller than a cache-line (64 bytes) - // we will pay a huge penalty on loads, fetching the same content many times. - // It may be better to allow caching (and subsequent eviction), in favor of using four-element - // tuples, wich will be guaranteed to be a multiple of a cache line. - // - // Another approach is to use the `LD4B` instructions, which will populate four registers at once. - // This however, further decreases the performance from LibC-like 29 GB/s to 20 GB/s. - else { - // Calculating head, body, and tail sizes depends on the `vec_len`, - // but it's runtime constant, and the modulo operation is expensive! - // Instead we use the fact, that it's always a multiple of 128 bits or 16 bytes. 
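/* A small standalone sketch (not part of this patch) of the head/body/tail split used below,
 * assuming the minimal 16-byte SVE register width: after the split, `target + head_length` is
 * 16-byte aligned, the body is a multiple of 16 bytes, and the three parts add up to `length`. */
static void split_for_aligned_stores(char *target, sz_size_t length, //
                                     sz_size_t *head_length, sz_size_t *body_length, sz_size_t *tail_length) {
    *head_length = 16 - ((sz_size_t)target % 16);         // 1 to 16 bytes, mirrors the code below
    *tail_length = (sz_size_t)(target + length) % 16;     // 0 to 15 bytes
    *body_length = length - *head_length - *tail_length;  // the remainder, a multiple of 16
}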
- sz_size_t head_length = 16 - ((sz_size_t)target % 16); - sz_size_t tail_length = (sz_size_t)(target + length) % 16; - sz_size_t body_length = length - head_length - tail_length; - - // Handle unaligned parts - svbool_t head_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)head_length); - svuint8_t head_data = svld1_u8(head_mask, (unsigned char *)source); - svst1_u8(head_mask, (unsigned char *)target, head_data); - svbool_t tail_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)tail_length); - svuint8_t tail_data = svld1_u8(tail_mask, (unsigned char *)source + head_length + body_length); - svst1_u8(tail_mask, (unsigned char *)target + head_length + body_length, tail_data); - target += head_length; - source += head_length; - - // Aligned body loop, walking in two directions - for (; body_length >= vec_len * 2; target += vec_len, source += vec_len, body_length -= vec_len * 2) { - svuint8_t forward_data = svld1_u8(svptrue_b8(), (unsigned char *)source); - svuint8_t backward_data = svld1_u8(svptrue_b8(), (unsigned char *)source + body_length - vec_len); - svst1_u8(svptrue_b8(), (unsigned char *)target, forward_data); - svst1_u8(svptrue_b8(), (unsigned char *)target + body_length - vec_len, backward_data); - } - // Up to (vec_len * 2 - 1) bytes of data may be left in the body, - // so we can unroll the last two optional loop iterations. - if (body_length > vec_len) { - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)body_length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - body_length -= vec_len; - source += body_length; - target += body_length; - } - if (body_length) { - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)body_length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - } - } -} - #pragma clang attribute pop #pragma GCC pop_options -#endif // Arm SVE +#endif // SZ_USE_SVE +#pragma endregion // SVE Implementation -#pragma endregion - -/* - * @brief Pick the right implementation for the string search algorithms. +/* Pick the right implementation for the string search algorithms. + * To override this behavior and precompile all backends - set `SZ_DYNAMIC_DISPATCH` to 1. */ #pragma region Compile Time Dispatching - -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t ins, sz_size_t length) { return sz_hash_serial(ins, length); } -SZ_PUBLIC void sz_tolower(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_tolower_serial(ins, length, outs); } -SZ_PUBLIC void sz_toupper(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toupper_serial(ins, length, outs); } -SZ_PUBLIC void sz_toascii(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toascii_serial(ins, length, outs); } -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t ins, sz_size_t length) { return sz_isascii_serial(ins, length); } - -SZ_PUBLIC void sz_hashes_fingerprint(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_ptr_t fingerprint, - sz_size_t fingerprint_bytes) { - - sz_bool_t fingerprint_length_is_power_of_two = (sz_bool_t)((fingerprint_bytes & (fingerprint_bytes - 1)) == 0); - sz_string_view_t fingerprint_buffer = {fingerprint, fingerprint_bytes}; - - // There are several issues related to the fingerprinting algorithm. - // First, the memory traversal order is important. - // https://blog.stuffedcow.net/2015/08/pagewalk-coherence/ - - // In most cases the fingerprint length will be a power of two. 
- if (fingerprint_length_is_power_of_two == sz_false_k) - sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_non_pow2_callback, &fingerprint_buffer); - else - sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_pow2_callback, &fingerprint_buffer); -} - #if !SZ_DYNAMIC_DISPATCH -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { -#if SZ_USE_X86_AVX512 - return sz_checksum_avx512(text, length); -#elif SZ_USE_X86_AVX2 - return sz_checksum_avx2(text, length); -#elif SZ_USE_ARM_NEON - return sz_checksum_neon(text, length); -#else - return sz_checksum_serial(text, length); -#endif -} - -SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { -#if SZ_USE_X86_AVX512 - return sz_equal_avx512(a, b, length); -#elif SZ_USE_X86_AVX2 - return sz_equal_avx2(a, b, length); -#elif SZ_USE_ARM_NEON - return sz_equal_neon(a, b, length); -#else - return sz_equal_serial(a, b, length); -#endif -} - -SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { -#if SZ_USE_X86_AVX512 - return sz_order_avx512(a, a_length, b, b_length); -#elif SZ_USE_X86_AVX2 - return sz_order_avx2(a, a_length, b, b_length); -#elif SZ_USE_ARM_NEON - return sz_order_neon(a, a_length, b, b_length); -#else - return sz_order_serial(a, a_length, b, b_length); -#endif -} - -SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_X86_AVX512 - sz_copy_avx512(target, source, length); -#elif SZ_USE_X86_AVX2 - sz_copy_avx2(target, source, length); -#elif SZ_USE_ARM_NEON - sz_copy_neon(target, source, length); -#else - sz_copy_serial(target, source, length); -#endif -} - -SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_X86_AVX512 - sz_move_avx512(target, source, length); -#elif SZ_USE_X86_AVX2 - sz_move_avx2(target, source, length); -#elif SZ_USE_ARM_NEON - sz_move_neon(target, source, length); -#else - sz_move_serial(target, source, length); -#endif -} - -SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) { -#if SZ_USE_X86_AVX512 - sz_fill_avx512(target, length, value); -#elif SZ_USE_X86_AVX2 - sz_fill_avx2(target, length, value); -#elif SZ_USE_ARM_NEON - sz_fill_neon(target, length, value); -#else - sz_fill_serial(target, length, value); -#endif -} - -SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { -#if SZ_USE_X86_AVX512 - sz_look_up_transform_avx512(source, length, lut, target); -#elif SZ_USE_X86_AVX2 - sz_look_up_transform_avx2(source, length, lut, target); -#elif SZ_USE_ARM_NEON - sz_look_up_transform_neon(source, length, lut, target); -#else - sz_look_up_transform_serial(source, length, lut, target); -#endif -} +#pragma region Core Funcitonality SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE return sz_find_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_X86_AVX2 - return sz_find_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_HASWELL + return sz_find_byte_haswell(haystack, h_length, needle); +#elif SZ_USE_NEON return sz_find_byte_neon(haystack, h_length, needle); #else return sz_find_byte_serial(haystack, h_length, needle); @@ -6998,11 +1704,11 @@ SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cpt } SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_X86_AVX512 +#if 
SZ_USE_ICE return sz_rfind_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_X86_AVX2 - return sz_rfind_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_HASWELL + return sz_rfind_byte_haswell(haystack, h_length, needle); +#elif SZ_USE_NEON return sz_rfind_byte_neon(haystack, h_length, needle); #else return sz_rfind_byte_serial(haystack, h_length, needle); @@ -7010,11 +1716,11 @@ SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cp } SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_X86_AVX512 - return sz_find_avx512(haystack, h_length, needle, n_length); -#elif SZ_USE_X86_AVX2 - return sz_find_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_ARM_NEON +#if SZ_USE_ICE + return sz_find_skylake(haystack, h_length, needle, n_length); +#elif SZ_USE_HASWELL + return sz_find_haswell(haystack, h_length, needle, n_length); +#elif SZ_USE_NEON return sz_find_neon(haystack, h_length, needle, n_length); #else return sz_find_serial(haystack, h_length, needle, n_length); @@ -7022,11 +1728,11 @@ SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t n } SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_X86_AVX512 - return sz_rfind_avx512(haystack, h_length, needle, n_length); -#elif SZ_USE_X86_AVX2 - return sz_rfind_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_ARM_NEON +#if SZ_USE_ICE + return sz_rfind_skylake(haystack, h_length, needle, n_length); +#elif SZ_USE_HASWELL + return sz_rfind_haswell(haystack, h_length, needle, n_length); +#elif SZ_USE_NEON return sz_rfind_neon(haystack, h_length, needle, n_length); #else return sz_rfind_serial(haystack, h_length, needle, n_length); @@ -7034,11 +1740,11 @@ SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t } SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_X86_AVX512 - return sz_find_charset_avx512(text, length, set); -#elif SZ_USE_X86_AVX2 - return sz_find_charset_avx2(text, length, set); -#elif SZ_USE_ARM_NEON +#if SZ_USE_ICE + return sz_find_charset_ice(text, length, set); +#elif SZ_USE_HASWELL + return sz_find_charset_haswell(text, length, set); +#elif SZ_USE_NEON return sz_find_charset_neon(text, length, set); #else return sz_find_charset_serial(text, length, set); @@ -7046,69 +1752,19 @@ SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charse } SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_X86_AVX512 - return sz_rfind_charset_avx512(text, length, set); -#elif SZ_USE_X86_AVX2 - return sz_rfind_charset_avx2(text, length, set); -#elif SZ_USE_ARM_NEON +#if SZ_USE_ICE + return sz_rfind_charset_ice(text, length, set); +#elif SZ_USE_HASWELL + return sz_rfind_charset_haswell(text, length, set); +#elif SZ_USE_NEON return sz_rfind_charset_neon(text, length, set); #else return sz_rfind_charset_serial(text, length, set); #endif } -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_serial(a, a_length, b, b_length, bound); -} - -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_utf8_serial(a, a_length, b, 
b_length, bound); -} - -SZ_DYNAMIC sz_size_t sz_edit_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { -#if SZ_USE_X86_AVX512 - return sz_edit_distance_avx512(a, a_length, b, b_length, bound, alloc); -#else - return sz_edit_distance_serial(a, a_length, b, b_length, bound, alloc); -#endif -} - -SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - return _sz_edit_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc); -} - -SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_error_cost_t const *subs, sz_error_cost_t gap, - sz_memory_allocator_t *alloc) { -#if SZ_USE_X86_AVX512 - return sz_alignment_score_avx512(a, a_length, b, b_length, subs, gap, alloc); -#else - return sz_alignment_score_serial(a, a_length, b, b_length, subs, gap, alloc); -#endif -} - -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle) { -#if SZ_USE_X86_AVX512 - sz_hashes_avx512(text, length, window_length, window_step, callback, callback_handle); -#elif SZ_USE_X86_AVX2 - sz_hashes_avx2(text, length, window_length, window_step, callback, callback_handle); -#else - sz_hashes_serial(text, length, window_length, window_step, callback, callback_handle); -#endif -} +#pragma endregion +#pragma region Helper Shortcuts SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { sz_charset_t set; @@ -7140,17 +1796,11 @@ SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_ return sz_rfind_charset(h, h_length, &set); } -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { - sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data); -} - -#endif -#pragma endregion +#pragma endregion // Helper Shortcuts +#endif // !SZ_DYNAMIC_DISPATCH +#pragma endregion // Compile Time Dispatching #ifdef __cplusplus -#pragma GCC diagnostic pop } #endif // __cplusplus - -#endif // STRINGZILLA_H_ +#endif // STRINGZILLA_FIND_H_ From 295d49a38d66b08075357ac829ad66d80b5edab0 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 14:49:26 +0000 Subject: [PATCH 032/751] Fix: Filter `memory.h` file --- include/stringzilla/memory.h | 6359 ++-------------------------------- 1 file changed, 262 insertions(+), 6097 deletions(-) diff --git a/include/stringzilla/memory.h b/include/stringzilla/memory.h index de7fbcac..87957878 100644 --- a/include/stringzilla/memory.h +++ b/include/stringzilla/memory.h @@ -1,3082 +1,166 @@ /** - * @brief StringZilla is a collection of advanced string algorithms, designed to be used in Big Data applications. - * It is generally faster than LibC, and has a broader & cleaner interface, and targets modern x86 CPUs - * with AVX-512 and Arm NEON and older CPUs with SWAR and auto-vectorization. - * - * Consider overriding the following macros to customize the library: - * - * - `SZ_DEBUG=0` - whether to enable debug assertions and logging. - * - `SZ_DYNAMIC_DISPATCH=0` - whether to use runtime dispatching of the most advanced SIMD backend. 
- * - `SZ_USE_MISALIGNED_LOADS=0` - whether to use misaligned loads on platforms that support them. - * - `SZ_SWAR_THRESHOLD=24` - threshold for switching to SWAR backend over serial byte-level for-loops. - * - `SZ_USE_X86_AVX512=?` - whether to use AVX-512 instructions on x86_64. - * - `SZ_USE_X86_AVX2=?` - whether to use AVX2 instructions on x86_64. - * - `SZ_USE_ARM_NEON=?` - whether to use NEON instructions on ARM. - * - `SZ_USE_ARM_SVE=?` - whether to use SVE instructions on ARM. - * - * @see StringZilla: https://github.com/ashvardanian/StringZilla/blob/main/README.md - * @see LibC String: https://pubs.opengroup.org/onlinepubs/009695399/basedefs/string.h.html - * - * @file stringzilla.h + * @brief Hardware-accelerated memory operations. + * @file memory.h * @author Ash Vardanian - */ -#ifndef STRINGZILLA_H_ -#define STRINGZILLA_H_ - -#define STRINGZILLA_VERSION_MAJOR 3 -#define STRINGZILLA_VERSION_MINOR 11 -#define STRINGZILLA_VERSION_PATCH 0 - -/** - * @brief When set to 1, the library will include the following LibC headers: and . - * In debug builds (SZ_DEBUG=1), the library will also include and . - * - * You may want to disable this compiling for use in the kernel, or in embedded systems. - * You may also avoid them, if you are very sensitive to compilation time and avoid pre-compiled headers. - * https://artificial-mind.net/projects/compile-health/ - */ -#ifndef SZ_AVOID_LIBC -#define SZ_AVOID_LIBC (0) // true or false -#endif - -/** - * @brief A misaligned load can be - trying to fetch eight consecutive bytes from an address - * that is not divisible by eight. On x86 enabled by default. On ARM it's not. - * - * Most platforms support it, but there is no industry standard way to check for those. - * This value will mostly affect the performance of the serial (SWAR) backend. - */ -#ifndef SZ_USE_MISALIGNED_LOADS -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) -#define SZ_USE_MISALIGNED_LOADS (1) // true or false -#else -#define SZ_USE_MISALIGNED_LOADS (0) // true or false -#endif -#endif - -/** - * @brief Removes compile-time dispatching, and replaces it with runtime dispatching. - * So the `sz_find` function will invoke the most advanced backend supported by the CPU, - * that runs the program, rather than the most advanced backend supported by the CPU - * used to compile the library or the downstream application. - */ -#ifndef SZ_DYNAMIC_DISPATCH -#define SZ_DYNAMIC_DISPATCH (0) // true or false -#endif - -/** - * @brief Analogous to `size_t` and `std::size_t`, unsigned integer, identical to pointer size. - * 64-bit on most platforms where pointers are 64-bit. - * 32-bit on platforms where pointers are 32-bit. - */ -#if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64) -#define SZ_DETECT_64_BIT (1) -#define SZ_SIZE_MAX (0xFFFFFFFFFFFFFFFFull) // Largest unsigned integer that fits into 64 bits. -#define SZ_SSIZE_MAX (0x7FFFFFFFFFFFFFFFull) // Largest signed integer that fits into 64 bits. -#else -#define SZ_DETECT_64_BIT (0) -#define SZ_SIZE_MAX (0xFFFFFFFFu) // Largest unsigned integer that fits into 32 bits. -#define SZ_SSIZE_MAX (0x7FFFFFFFu) // Largest signed integer that fits into 32 bits. -#endif - -/** - * @brief On Big-Endian machines StringZilla will work in compatibility mode. - * This disables SWAR hacks to minimize code duplication, assuming practically - * all modern popular platforms are Little-Endian. - * - * This variable is hard to infer from macros reliably. It's best to set it manually. 
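Since the header stresses that endianness is hard to infer from macros alone, a tiny runtime check is a common way to validate whatever `SZ_DETECT_BIG_ENDIAN` value a build system picked. A sketch suitable for a unit test, not part of the library itself:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    uint32_t word = 0x01020304u;
    uint8_t first_byte;
    memcpy(&first_byte, &word, 1); // inspect the lowest-addressed byte
    // Little-endian machines store the least significant byte first.
    printf("runtime check: %s-endian\n", first_byte == 0x04 ? "little" : "big");
    return 0;
}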
- * For that CMake provides the `TestBigEndian` and `CMAKE__BYTE_ORDER` (from 3.20 onwards). - * In Python one can check `sys.byteorder == 'big'` in the `setup.py` script and pass the appropriate macro. - * https://stackoverflow.com/a/27054190 - */ -#ifndef SZ_DETECT_BIG_ENDIAN -#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \ - defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || defined(__MIBSEB) || defined(__MIBSEB__) -#define SZ_DETECT_BIG_ENDIAN (1) //< It's a big-endian target architecture -#else -#define SZ_DETECT_BIG_ENDIAN (0) //< It's a little-endian target architecture -#endif -#endif - -/* - * Debugging and testing. - */ -#ifndef SZ_DEBUG -#if defined(DEBUG) || defined(_DEBUG) // This means "Not using DEBUG information". -#define SZ_DEBUG (1) -#else -#define SZ_DEBUG (0) -#endif -#endif - -/** - * @brief Threshold for switching to SWAR (8-bytes at a time) backend over serial byte-level for-loops. - * On very short strings, under 16 bytes long, at most a single word will be processed with SWAR. - * Assuming potentially misaligned loads, SWAR makes sense only after ~24 bytes. - */ -#ifndef SZ_SWAR_THRESHOLD -#if SZ_DEBUG -#define SZ_SWAR_THRESHOLD (8u) // 8 bytes in debug builds -#else -#define SZ_SWAR_THRESHOLD (24u) // 24 bytes in release builds -#endif -#endif - -/* Annotation for the public API symbols: - * - * - `SZ_PUBLIC` is used for functions that are part of the public API. - * - `SZ_INTERNAL` is used for internal helper functions with unstable APIs. - * - `SZ_DYNAMIC` is used for functions that are part of the public API, but are dispatched at runtime. - */ -#ifndef SZ_DYNAMIC -#if SZ_DYNAMIC_DISPATCH -#if defined(_WIN32) || defined(__CYGWIN__) -#define SZ_DYNAMIC __declspec(dllexport) -#define SZ_EXTERNAL __declspec(dllimport) -#define SZ_PUBLIC inline static -#define SZ_INTERNAL inline static -#else -#define SZ_DYNAMIC __attribute__((visibility("default"))) -#define SZ_EXTERNAL extern -#define SZ_PUBLIC __attribute__((unused)) inline static -#define SZ_INTERNAL __attribute__((always_inline)) inline static -#endif // _WIN32 || __CYGWIN__ -#else -#define SZ_DYNAMIC inline static -#define SZ_EXTERNAL extern -#define SZ_PUBLIC inline static -#define SZ_INTERNAL inline static -#endif // SZ_DYNAMIC_DISPATCH -#endif // SZ_DYNAMIC - -/** - * @brief Alignment macro for 64-byte alignment. - */ -#if defined(_MSC_VER) -#define SZ_ALIGN64 __declspec(align(64)) -#elif defined(__GNUC__) || defined(__clang__) -#define SZ_ALIGN64 __attribute__((aligned(64))) -#else -#define SZ_ALIGN64 -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Let's infer the integer types or pull them from LibC, - * if that is allowed by the user. - */ -#if !SZ_AVOID_LIBC -#include // `size_t` -#include // `uint8_t` -typedef int8_t sz_i8_t; // Always 8 bits -typedef uint8_t sz_u8_t; // Always 8 bits -typedef uint16_t sz_u16_t; // Always 16 bits -typedef int32_t sz_i32_t; // Always 32 bits -typedef uint32_t sz_u32_t; // Always 32 bits -typedef uint64_t sz_u64_t; // Always 64 bits -typedef int64_t sz_i64_t; // Always 64 bits -typedef size_t sz_size_t; // Pointer-sized unsigned integer, 32 or 64 bits -typedef ptrdiff_t sz_ssize_t; // Signed version of `sz_size_t`, 32 or 64 bits - -#else // if SZ_AVOID_LIBC: - -// ! The C standard doesn't specify the signedness of char. -// ! On x86 char is signed by default while on Arm it is unsigned by default. -// ! 
That's why we don't define `sz_char_t` and generally use explicit `sz_i8_t` and `sz_u8_t`. -typedef signed char sz_i8_t; // Always 8 bits -typedef unsigned char sz_u8_t; // Always 8 bits -typedef unsigned short sz_u16_t; // Always 16 bits -typedef int sz_i32_t; // Always 32 bits -typedef unsigned int sz_u32_t; // Always 32 bits -typedef long long sz_i64_t; // Always 64 bits -typedef unsigned long long sz_u64_t; // Always 64 bits - -// Now we need to redefine the `size_t`. -// Microsoft Visual C++ (MSVC) typically follows LLP64 data model on 64-bit platforms, -// where integers, pointers, and long types have different sizes: -// -// > `int` is 32 bits -// > `long` is 32 bits -// > `long long` is 64 bits -// > pointer (thus, `size_t`) is 64 bits -// -// In contrast, GCC and Clang on 64-bit Unix-like systems typically follow the LP64 model, where: -// -// > `int` is 32 bits -// > `long` and pointer (thus, `size_t`) are 64 bits -// > `long long` is also 64 bits -// -// Source: https://learn.microsoft.com/en-us/windows/win32/winprog64/abstract-data-models -#if SZ_DETECT_64_BIT -typedef unsigned long long sz_size_t; // 64-bit. -typedef long long sz_ssize_t; // 64-bit. -#else -typedef unsigned sz_size_t; // 32-bit. -typedef unsigned sz_ssize_t; // 32-bit. -#endif // SZ_DETECT_64_BIT - -#endif // SZ_AVOID_LIBC - -/** - * @brief Compile-time assert macro similar to `static_assert` in C++. - */ -#define sz_static_assert(condition, name) \ - typedef struct { \ - int static_assert_##name : (condition) ? 1 : -1; \ - } sz_static_assert_##name##_t - -sz_static_assert(sizeof(sz_size_t) == sizeof(void *), sz_size_t_must_be_pointer_size); -sz_static_assert(sizeof(sz_ssize_t) == sizeof(void *), sz_ssize_t_must_be_pointer_size); - -#pragma region Public API - -typedef char *sz_ptr_t; // A type alias for `char *` -typedef char const *sz_cptr_t; // A type alias for `char const *` -typedef sz_i8_t sz_error_cost_t; // Character mismatch cost for fuzzy matching functions - -typedef sz_u64_t sz_sorted_idx_t; // Index of a sorted string in a list of strings - -typedef enum { sz_false_k = 0, sz_true_k = 1 } sz_bool_t; // Only one relevant bit -typedef enum { sz_less_k = -1, sz_equal_k = 0, sz_greater_k = 1 } sz_ordering_t; // Only three possible states: <=> - -/** - * @brief Tiny string-view structure. It's POD type, unlike the `std::string_view`. - */ -typedef struct sz_string_view_t { - sz_cptr_t start; - sz_size_t length; -} sz_string_view_t; - -/** - * @brief Enumeration of SIMD capabilities of the target architecture. - * Used to introspect the supported functionality of the dynamic library. - */ -typedef enum sz_capability_t { - sz_cap_serial_k = 1, /// Serial (non-SIMD) capability - sz_cap_any_k = 0x7FFFFFFF, /// Mask representing any capability - - sz_cap_arm_neon_k = 1 << 10, /// ARM NEON capability - sz_cap_arm_sve_k = 1 << 11, /// ARM SVE capability TODO: Not yet supported or used - sz_cap_arm_sve2_k = 1 << 12, - sz_cap_arm_sve2p1_k = 1 << 13, - sz_cap_x86_avx2_k = 1 << 20, /// x86 AVX2 capability - sz_cap_x86_avx512f_k = 1 << 21, /// x86 AVX512 F capability - sz_cap_x86_avx512bw_k = 1 << 22, /// x86 AVX512 BW instruction capability - sz_cap_x86_avx512vl_k = 1 << 23, /// x86 AVX512 VL instruction capability - sz_cap_x86_avx512vbmi_k = 1 << 24, /// x86 AVX512 VBMI instruction capability - sz_cap_x86_gfni_k = 1 << 25, /// x86 AVX512 GFNI instruction capability - -} sz_capability_t; - -/** - * @brief Function to determine the SIMD capabilities of the current machine @b only at @b runtime. 
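Because `sz_capability_t` is a plain bitmask, callers can probe the runtime feature set returned by `sz_capabilities()` with bitwise tests. A usage sketch; the include path is an assumption, and only the enumerators and function declared here are relied upon:

#include <stdio.h>
#include <stringzilla/stringzilla.h> // assumed include path for the declarations above

int main(void) {
    sz_capability_t caps = sz_capabilities(); // runtime mask, one bit per feature
    printf("serial: %d\n", (caps & sz_cap_serial_k) != 0);
    printf("AVX2:   %d\n", (caps & sz_cap_x86_avx2_k) != 0);
    printf("AVX512: %d\n", (caps & sz_cap_x86_avx512bw_k) != 0);
    printf("NEON:   %d\n", (caps & sz_cap_arm_neon_k) != 0);
    return 0;
}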
- * @return A bitmask of the SIMD capabilities represented as a `sz_capability_t` enum value. - */ -SZ_DYNAMIC sz_capability_t sz_capabilities(void); - -/** - * @brief Bit-set structure for 256 possible byte values. Useful for filtering and search. - * @see sz_charset_init, sz_charset_add, sz_charset_contains, sz_charset_invert - */ -typedef union sz_charset_t { - sz_u64_t _u64s[4]; - sz_u32_t _u32s[8]; - sz_u16_t _u16s[16]; - sz_u8_t _u8s[32]; -} sz_charset_t; - -/** @brief Initializes a bit-set to an empty collection, meaning - all characters are banned. */ -SZ_PUBLIC void sz_charset_init(sz_charset_t *s) { s->_u64s[0] = s->_u64s[1] = s->_u64s[2] = s->_u64s[3] = 0; } - -/** @brief Adds a character to the set and accepts @b unsigned integers. */ -SZ_PUBLIC void sz_charset_add_u8(sz_charset_t *s, sz_u8_t c) { s->_u64s[c >> 6] |= (1ull << (c & 63u)); } - -/** @brief Adds a character to the set. Consider @b sz_charset_add_u8. */ -SZ_PUBLIC void sz_charset_add(sz_charset_t *s, char c) { sz_charset_add_u8(s, *(sz_u8_t *)(&c)); } // bitcast - -/** @brief Checks if the set contains a given character and accepts @b unsigned integers. */ -SZ_PUBLIC sz_bool_t sz_charset_contains_u8(sz_charset_t const *s, sz_u8_t c) { - // Checking the bit can be done in different ways: - // - (s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0 - // - (s->_u32s[c >> 5] & (1u << (c & 31u))) != 0 - // - (s->_u16s[c >> 4] & (1u << (c & 15u))) != 0 - // - (s->_u8s[c >> 3] & (1u << (c & 7u))) != 0 - return (sz_bool_t)((s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0); -} - -/** @brief Checks if the set contains a given character. Consider @b sz_charset_contains_u8. */ -SZ_PUBLIC sz_bool_t sz_charset_contains(sz_charset_t const *s, char c) { - return sz_charset_contains_u8(s, *(sz_u8_t *)(&c)); // bitcast -} - -/** @brief Inverts the contents of the set, so allowed character get disallowed, and vice versa. */ -SZ_PUBLIC void sz_charset_invert(sz_charset_t *s) { - s->_u64s[0] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[1] ^= 0xFFFFFFFFFFFFFFFFull, // - s->_u64s[2] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[3] ^= 0xFFFFFFFFFFFFFFFFull; -} - -typedef void *(*sz_memory_allocate_t)(sz_size_t, void *); -typedef void (*sz_memory_free_t)(void *, sz_size_t, void *); -typedef sz_u64_t (*sz_random_generator_t)(void *); - -/** - * @brief Some complex pattern matching algorithms may require memory allocations. - * This structure is used to pass the memory allocator to those functions. - * @see sz_memory_allocator_init_fixed - */ -typedef struct sz_memory_allocator_t { - sz_memory_allocate_t allocate; - sz_memory_free_t free; - void *handle; -} sz_memory_allocator_t; - -/** - * @brief Initializes a memory allocator to use the system default `malloc` and `free`. - * ! The function is not available if the library was compiled with `SZ_AVOID_LIBC`. - * - * @param alloc Memory allocator to initialize. - */ -SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc); - -/** - * @brief Initializes a memory allocator to use a static-capacity buffer. - * No dynamic allocations will be performed. - * - * @param alloc Memory allocator to initialize. - * @param buffer Buffer to use for allocations. - * @param length Length of the buffer. @b Must be greater than 8 bytes. Different values would be optimal for - * different algorithms and input lengths, but 4096 bytes (one RAM page) is a good default. 
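The charset helpers defined above are enough to build byte filters without touching the raw 256-bit words directly. A usage sketch, assuming the same include path as before, that builds the six-character whitespace set later mentioned in the search API docs:

#include <stdio.h>
#include <stringzilla/stringzilla.h> // assumed include path for the definitions above

int main(void) {
    sz_charset_t whitespace;
    sz_charset_init(&whitespace); // start with every byte banned
    char const *spaces = " \t\n\r\v\f";
    for (char const *c = spaces; *c; ++c) sz_charset_add(&whitespace, *c);

    printf("' ' in set: %d\n", sz_charset_contains(&whitespace, ' ')); // 1
    printf("'x' in set: %d\n", sz_charset_contains(&whitespace, 'x')); // 0

    sz_charset_invert(&whitespace); // now every non-whitespace byte matches
    printf("'x' after invert: %d\n", sz_charset_contains(&whitespace, 'x')); // 1
    return 0;
}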
- */ -SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length); - -/** - * @brief The number of bytes a stack-allocated string can hold, including the SZ_NULL termination character. - * ! This can't be changed from outside. Don't use the `#error` as it may already be included and set. - */ -#ifdef SZ_STRING_INTERNAL_SPACE -#undef SZ_STRING_INTERNAL_SPACE -#endif -#define SZ_STRING_INTERNAL_SPACE (sizeof(sz_size_t) * 3 - 1) // 3 pointers minus one byte for an 8-bit length - -/** - * @brief Tiny memory-owning string structure with a Small String Optimization (SSO). - * Differs in layout from Folly, Clang, GCC, and probably most other implementations. - * It's designed to avoid any branches on read-only operations, and can store up - * to 22 characters on stack on 64-bit machines, followed by the SZ_NULL-termination character. - * - * @section Changing Length - * - * One nice thing about this design, is that you can, in many cases, change the length of the string - * without any branches, invoking a `+=` or `-=` on the 64-bit `length` field. If the string is on heap, - * the solution is obvious. If it's on stack, inplace decrement wouldn't affect the top bytes of the string, - * only changing the last byte containing the length. - */ -typedef union sz_string_t { - -#if !SZ_DETECT_BIG_ENDIAN - - struct external { - sz_ptr_t start; - sz_size_t length; - sz_size_t space; - sz_size_t padding; - } external; - - struct internal { - sz_ptr_t start; - sz_u8_t length; - char chars[SZ_STRING_INTERNAL_SPACE]; - } internal; - -#else - - struct external { - sz_ptr_t start; - sz_size_t space; - sz_size_t padding; - sz_size_t length; - } external; - - struct internal { - sz_ptr_t start; - char chars[SZ_STRING_INTERNAL_SPACE]; - sz_u8_t length; - } internal; - -#endif - - sz_size_t words[4]; - -} sz_string_t; - -typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t); -typedef sz_u64_t (*sz_checksum_t)(sz_cptr_t, sz_size_t); -typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t); -typedef sz_ordering_t (*sz_order_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -typedef void (*sz_to_converter_t)(sz_cptr_t, sz_size_t, sz_ptr_t); - -/** - * @brief Computes the 64-bit check-sum of bytes in a string. - * Similar to `std::ranges::accumulate`. - * - * @param text String to aggregate. - * @param length Number of bytes in the text. - * @return 64-bit unsigned value. - */ -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length); - -/** @copydoc sz_checksum */ -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length); - -/** - * @brief Computes the 64-bit unsigned hash of a string. Fairly fast for short strings, - * simple implementation, and supports rolling computation, reused in other APIs. - * Similar to `std::hash` in C++. - * - * @param text String to hash. - * @param length Number of bytes in the text. - * @return 64-bit hash value. - * - * @see sz_hashes, sz_hashes_fingerprint, sz_hashes_intersection - */ -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length); - -/** @copydoc sz_hash */ -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length); - -/** - * @brief Checks if two string are equal. - * Similar to `memcmp(a, b, length) == 0` in LibC and `a == b` in STL. - * - * The implementation of this function is very similar to `sz_order`, but the usage patterns are different. - * This function is more often used in parsing, while `sz_order` is often used in sorting. 
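The SSO layout above packs three pointer-sized words into the union and spends one byte on the on-stack length. The arithmetic behind the "22 characters on 64-bit machines" claim, assuming 8-byte pointers:

#include <stddef.h>
#include <stdio.h>

int main(void) {
    // Mirrors SZ_STRING_INTERNAL_SPACE: three pointer-sized words minus one byte for the 8-bit length.
    size_t word = sizeof(void *);                 // 8 on 64-bit targets
    size_t internal_space = word * 3 - 1;         // 23 bytes of character storage
    size_t longest_on_stack = internal_space - 1; // 22 characters plus the NULL terminator
    printf("internal space: %zu bytes, longest stack-resident string: %zu characters\n", //
           internal_space, longest_on_stack);
    return 0;
}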
- * It works best on platforms with cheap - * - * @param a First string to compare. - * @param b Second string to compare. - * @param length Number of bytes in both strings. - * @return 1 if strings match, 0 otherwise. - */ -SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length); - -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length); - -/** - * @brief Estimates the relative order of two strings. Equivalent to `memcmp(a, b, length)` in LibC. - * Can be used on different length strings. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * @return Negative if (a < b), positive if (a > b), zero if they are equal. - */ -SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); - -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); - -/** - * @brief Look Up Table @b (LUT) transformation of a string. Equivalent to `for (char & c : text) c = lut[c]`. - * - * Can be used to implement some form of string normalization, partially masking punctuation marks, - * or converting between different character sets, like uppercase or lowercase. Surprisingly, also has - * broad implications in image processing, where image channel transformations are often done using LUTs. - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param lut Look Up Table to apply. Must be exactly @b 256 bytes long. - * @param result Output string, can point to the same address as ::text. - */ -SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); - -typedef void (*sz_look_up_transform_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t); - -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = tolower(c)`. - * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_tolower(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = toupper(c)`. * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. 
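Since `sz_look_up_transform` is documented as `for (char & c : text) c = lut[c]`, case conversion is just a particular 256-byte table. A serial sketch of building such a table for ASCII lowercasing and applying it, matching the semantics described above rather than any SIMD backend:

#include <stdio.h>
#include <string.h>

int main(void) {
    // Identity table, except the 26 uppercase ASCII letters get bit 0x20 set.
    unsigned char lut[256];
    for (int b = 0; b < 256; ++b) lut[b] = (unsigned char)b;
    for (int b = 'A'; b <= 'Z'; ++b) lut[b] = (unsigned char)(b | 0x20);

    char text[] = "StringZilla, FAST strings!";
    size_t length = strlen(text);
    for (size_t i = 0; i < length; ++i) text[i] = (char)lut[(unsigned char)text[i]];
    printf("%s\n", text); // stringzilla, fast strings!
    return 0;
}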
- */ -SZ_PUBLIC void sz_toupper(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = toascii(c)`. - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Checks if all characters in the range are valid ASCII characters. - * - * @param text String to be analyzed. - * @param length Number of bytes in the string. - * @return Whether all characters are valid ASCII characters. - */ -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t text, sz_size_t length); - -/** - * @brief Generates a random string for a given alphabet, avoiding integer division and modulo operations. - * Similar to `text[i] = alphabet[rand() % cardinality]`. - * - * The modulo operation is expensive, and should be avoided in performance-critical code. - * We avoid it using small lookup tables and replacing it with a multiplication and shifts, similar to `libdivide`. - * Alternative algorithms would include: - * - Montgomery form: https://en.algorithmica.org/hpc/number-theory/montgomery/ - * - Barret reduction: https://www.nayuki.io/page/barrett-reduction-algorithm - * - Lemire's trick: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ - * - * @param alphabet Set of characters to sample from. - * @param cardinality Number of characters to sample from. - * @param text Output string, can point to the same address as ::text. - * @param generate Callback producing random numbers given the generator state. - * @param generator Generator state, can be a pointer to a seed, or a pointer to a random number generator. - */ -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); - -/** @copydoc sz_generate */ -SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); - -/** - * @brief Similar to `memcpy`, copies contents of one string into another. - * The behavior is undefined if the strings overlap. - * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. - */ -SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** - * @brief Similar to `memmove`, copies (moves) contents of one string into another. - * Unlike `sz_copy`, allows overlapping strings as arguments. - * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. - */ -SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -typedef void (*sz_move_t)(sz_ptr_t, sz_cptr_t, sz_size_t); - -/** - * @brief Similar to `memset`, fills a string with a given value. - * - * @param target String to fill. - * @param length Number of bytes to fill. - * @param value Value to fill with. 
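The `sz_generate` docs cite Lemire's reduction as the way to avoid `rand() % cardinality`. A toy sketch of that multiply-shift mapping, with a stand-in xorshift generator playing the role of `sz_random_generator_t`; the four-letter alphabet is just an example:

#include <stdint.h>
#include <stdio.h>

// Maps a uniform 32-bit value into [0, cardinality) with one multiply and one shift,
// the alternative to integer modulo referenced in the docs above.
static uint32_t reduce_fast(uint32_t random_value, uint32_t cardinality) {
    return (uint32_t)(((uint64_t)random_value * (uint64_t)cardinality) >> 32);
}

static uint32_t xorshift32(uint32_t *state) { // toy generator for the demo
    uint32_t x = *state;
    x ^= x << 13, x ^= x >> 17, x ^= x << 5;
    return *state = x;
}

int main(void) {
    char const alphabet[] = "ACGT"; // hypothetical 4-letter alphabet
    char text[17] = {0};
    uint32_t state = 42u;
    for (int i = 0; i < 16; ++i) text[i] = alphabet[reduce_fast(xorshift32(&state), 4)];
    printf("%s\n", text);
    return 0;
}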
- */ -SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value); - -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value); - -typedef void (*sz_fill_t)(sz_ptr_t, sz_size_t, sz_u8_t); - -/** - * @brief Initializes a string class instance to an empty value. - */ -SZ_PUBLIC void sz_string_init(sz_string_t *string); - -/** - * @brief Convenience function checking if the provided string is stored inside of the ::string instance itself, - * alternative being - allocated in a remote region of the heap. - */ -SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string); - -/** - * @brief Unpacks the opaque instance of a string class into its components. - * Recommended to use only in read-only operations. - * - * @param string String to unpack. - * @param start Pointer to the start of the string. - * @param length Number of bytes in the string, before the SZ_NULL character. - * @param space Number of bytes allocated for the string (heap or stack), including the SZ_NULL character. - * @param is_external Whether the string is allocated on the heap externally, or fits withing ::string instance. - */ -SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, - sz_bool_t *is_external); - -/** - * @brief Unpacks only the start and length of the string. - * Recommended to use only in read-only operations. - * - * @param string String to unpack. - * @param start Pointer to the start of the string. - * @param length Number of bytes in the string, before the SZ_NULL character. - */ -SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length); - -/** - * @brief Constructs a string of a given ::length with noisy contents. - * Use the returned character pointer to populate the string. - * - * @param string String to initialize. - * @param length Number of bytes in the string, before the SZ_NULL character. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator); - -/** - * @brief Doesn't change the contents or the length of the string, but grows the available memory capacity. - * This is beneficial, if several insertions are expected, and we want to minimize allocations. - * - * @param string String to grow. - * @param new_capacity The number of characters to reserve space for, including existing ones. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the new start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator); - -/** - * @brief Grows the string by adding an uninitialized region of ::added_length at the given ::offset. - * Would often be used in conjunction with one or more `sz_copy` calls to populate the allocated region. - * Similar to `sz_string_reserve`, but changes the length of the ::string. - * - * @param string String to grow. - * @param offset Offset of the first byte to reserve space for. - * If provided offset is larger than the length, it will be capped. - * @param added_length The number of new characters to reserve space for. - * @param allocator Memory allocator to use for the allocation. 
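Putting the string lifecycle declared above together, a usage sketch based purely on the documented signatures: default allocator, a string of known length, read-back, and cleanup. The include path is an assumption:

#include <stdio.h>
#include <string.h>
#include <stringzilla/stringzilla.h> // assumed include path for the declarations above

int main(void) {
    sz_memory_allocator_t alloc;
    sz_memory_allocator_init_default(&alloc); // LibC malloc/free, per the docs above

    sz_string_t song;
    sz_ptr_t start = sz_string_init_length(&song, 11, &alloc); // uninitialized 11-byte string
    if (!start) return 1;                                      // allocation may fail
    memcpy(start, "hello world", 11);

    sz_ptr_t begin;
    sz_size_t length;
    sz_string_range(&song, &begin, &length); // read-only view of the contents
    printf("%.*s (%d bytes, on stack: %d)\n", (int)length, begin, (int)length, //
           (int)sz_string_is_on_stack(&song));

    sz_string_free(&song, &alloc); // frees heap storage or resets the stack buffer, per the docs
    return 0;
}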
- * @return SZ_NULL if the operation failed, pointer to the new start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length, - sz_memory_allocator_t *allocator); - -/** - * @brief Removes a range from a string. Changes the length, but not the capacity. - * Performs no allocations or deallocations and can't fail. - * - * @param string String to clean. - * @param offset Offset of the first byte to remove. - * @param length Number of bytes to remove. Out-of-bound ranges will be capped. - * @return Number of bytes removed. - */ -SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length); - -/** - * @brief Shrinks the string to fit the current length, if it's allocated on the heap. - * It's the reverse operation of ::sz_string_reserve. - * - * @param string String to shrink. - * @param allocator Memory allocator to use for the allocation. - * @return Whether the operation was successful. The only failures can come from the allocator. - * On failure, the string will remain unchanged. - */ -SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator); - -/** - * @brief Frees the string, if it's allocated on the heap. - * If the string is on the stack, the function clears/resets the state. - */ -SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator); - -#pragma endregion - -#pragma region Fast Substring Search API - -typedef sz_cptr_t (*sz_find_byte_t)(sz_cptr_t, sz_size_t, sz_cptr_t); -typedef sz_cptr_t (*sz_find_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_charset_t const *); - -/** - * @brief Locates first matching byte in a string. Equivalent to `memchr(haystack, *needle, h_length)` in LibC. - * - * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memchr.S - * Aarch64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/aarch64/memchr.S - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the first match. - */ -SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** - * @brief Locates last matching byte in a string. Equivalent to `memrchr(haystack, *needle, h_length)` in LibC. - * - * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memrchr.S - * Aarch64 implementation: missing - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the last match. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** - * @brief Locates first matching substring. - * Equivalent to `memmem(haystack, h_length, needle, n_length)` in LibC. - * Similar to `strstr(haystack, needle)` in LibC, but requires known length. - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. 
- * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the first match. - */ -SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** - * @brief Locates the last matching substring. - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the last match. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** - * @brief Finds the first character present from the ::set, present in ::text. - * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_rfind_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". - * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the first matching character from ::set. - */ -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** - * @brief Finds the last character present from the ::set, present in ::text. - * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_find_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". - * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the last matching character from ::set. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -#pragma endregion - -#pragma region String Similarity Measures API - -/** - * @brief Computes the Hamming distance between two strings - number of not matching characters. - * Difference in length is is counted as a mismatch. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. 
- * @return Unsigned integer for the distance, the `bound` if was exceeded. - * - * @see sz_hamming_distance_utf8 - * @see https://en.wikipedia.org/wiki/Hamming_distance - */ -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); - -/** @copydoc sz_hamming_distance */ -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); - -/** - * @brief Computes the Hamming distance between two @b UTF8 strings - number of not matching characters. - * Difference in length is is counted as a mismatch. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for the distance, the `bound` if was exceeded. - * - * @see sz_hamming_distance - * @see https://en.wikipedia.org/wiki/Hamming_distance - */ -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); - -/** @copydoc sz_hamming_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); - -typedef sz_size_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t); - -/** - * @brief Computes the Levenshtein edit-distance between two strings using the Wagner-Fisher algorithm. - * Similar to the Needleman-Wunsch alignment algorithm. Often used in fuzzy string matching. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Exclusive upper bound on the distance, that allows us to exit early. - * Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore. - * Pass zero to check if the strings are equal. - * @return Unsigned integer for the edit distance. Zero means the strings are equal. - * Returns the `bound` if it was exceeded or `SZ_SIZE_MAX` if the memory allocation failed. - * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default - * @see https://en.wikipedia.org/wiki/Levenshtein_distance - */ -SZ_DYNAMIC sz_size_t sz_edit_distance(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** @copydoc sz_edit_distance */ -SZ_PUBLIC sz_size_t sz_edit_distance_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** - * @brief Computes the Levenshtein edit-distance between two @b UTF8 strings. - * Unlike `sz_edit_distance`, reports the distance in Unicode codepoints, and not in bytes. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. 
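For readers unfamiliar with Wagner-Fisher, a compact two-row serial sketch shows where the linear memory usage mentioned above comes from; it omits the early-exit `bound`, the pluggable allocator, and the UTF-8 handling of the real API:

#include <stdio.h>
#include <stdlib.h>

static size_t edit_distance(char const *a, size_t len_a, char const *b, size_t len_b) {
    // Only two rows of the (len_a + 1) x (len_b + 1) matrix are kept alive at a time.
    size_t *previous = (size_t *)malloc((len_b + 1) * sizeof(size_t));
    size_t *current = (size_t *)malloc((len_b + 1) * sizeof(size_t));
    if (!previous || !current) { free(previous), free(current); return (size_t)-1; }
    for (size_t j = 0; j <= len_b; ++j) previous[j] = j;
    for (size_t i = 1; i <= len_a; ++i) {
        current[0] = i;
        for (size_t j = 1; j <= len_b; ++j) {
            size_t substitution = previous[j - 1] + (a[i - 1] != b[j - 1]);
            size_t insertion = current[j - 1] + 1;
            size_t deletion = previous[j] + 1;
            size_t best = substitution < insertion ? substitution : insertion;
            current[j] = best < deletion ? best : deletion;
        }
        size_t *swap = previous;
        previous = current, current = swap;
    }
    size_t result = previous[len_b];
    free(previous), free(current);
    return result;
}

int main(void) {
    printf("%zu\n", edit_distance("kitten", 6, "sitting", 7)); // prints 3
    return 0;
}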
- * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for edit distance, the `bound` if was exceeded or `SZ_SIZE_MAX` - * if the memory allocation failed. - * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default, sz_edit_distance - * @see https://en.wikipedia.org/wiki/Levenshtein_distance - */ -SZ_DYNAMIC sz_size_t sz_edit_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -typedef sz_size_t (*sz_edit_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_memory_allocator_t *); - -/** @copydoc sz_edit_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_edit_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** - * @brief Computes Needleman–Wunsch alignment score for two string. Often used in bioinformatics and cheminformatics. - * Similar to the Levenshtein edit-distance, parameterized for gap and substitution penalties. - * - * Not commutative in the general case, as the order of the strings matters, as `sz_alignment_score(a, b)` may - * not be equal to `sz_alignment_score(b, a)`. Becomes @b commutative, if the substitution costs are symmetric. - * Equivalent to the negative Levenshtein distance, if: `gap == -1` and `subs[i][j] == (i == j ? 0: -1)`. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * @param gap Penalty cost for gaps - insertions and removals. - * @param subs Substitution costs matrix with 256 x 256 values for all pairs of characters. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @return Signed similarity score. Can be negative, depending on the substitution costs. - * If the memory allocation fails, the function returns `SZ_SSIZE_MAX`. - * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default - * @see https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm - */ -SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); - -/** @copydoc sz_alignment_score */ -SZ_PUBLIC sz_ssize_t sz_alignment_score_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); - -typedef sz_ssize_t (*sz_alignment_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *, - sz_error_cost_t, sz_memory_allocator_t *); - -typedef void (*sz_hash_callback_t)(sz_cptr_t, sz_size_t, sz_u64_t, void *user); - -/** - * @brief Computes the Karp-Rabin rolling hashes of a string supplying them to the provided `callback`. 
- * Can be used for similarity scores, search, ranking, etc. - * - * Rabin-Karp-like rolling hashes can have very high-level of collisions and depend - * on the choice of bases and the prime number. That's why, often two hashes from the same - * family are used with different bases. - * - * 1. Kernighan and Ritchie's function uses 31, a prime close to the size of English alphabet. - * 2. To be friendlier to byte-arrays and UTF8, we use 257 for the second function. - * - * Choosing the right ::window_length is task- and domain-dependant. For example, most English words are - * between 3 and 7 characters long, so a window of 4 bytes would be a good choice. For DNA sequences, - * the ::window_length might be a multiple of 3, as the codons are 3 (nucleotides) bytes long. - * With such minimalistic alphabets of just four characters (AGCT) longer windows might be needed. - * For protein sequences the alphabet is 20 characters long, so the window can be shorter, than for DNAs. - * - * @param text String to hash. - * @param length Number of bytes in the string. - * @param window_length Length of the rolling window in bytes. - * @param window_step Step of reported hashes. @b Must be power of two. Should be smaller than `window_length`. - * @param callback Function receiving the start & length of a substring, the hash, and the `callback_handle`. - * @param callback_handle Optional user-provided pointer to be passed to the `callback`. - * @see sz_hashes_fingerprint, sz_hashes_intersection - */ -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); - -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); - -typedef void (*sz_hashes_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_size_t, sz_hash_callback_t, void *); - -/** - * @brief Computes the Karp-Rabin rolling hashes of a string outputting a binary fingerprint. - * Such fingerprints can be compared with Hamming or Jaccard (Tanimoto) distance for similarity. - * - * The algorithm doesn't clear the fingerprint buffer on start, so it can be invoked multiple times - * to produce a fingerprint of a longer string, by passing the previous fingerprint as the ::fingerprint. - * It can also be reused to produce multi-resolution fingerprints by changing the ::window_length - * and calling the same function multiple times for the same input ::text. - * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text String to hash. - * @param length Number of bytes in the string. - * @param fingerprint Output fingerprint buffer. - * @param fingerprint_bytes Number of bytes in the fingerprint buffer. - * @param window_length Length of the rolling window in bytes. - * @see sz_hashes, sz_hashes_intersection - */ -SZ_PUBLIC void sz_hashes_fingerprint( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_ptr_t fingerprint, sz_size_t fingerprint_bytes); - -typedef void (*sz_hashes_fingerprint_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_ptr_t, sz_size_t); - -/** - * @brief Given a hash-fingerprint of a textual document, computes the number of intersecting hashes - * of the incoming document. Can be used for document scoring and search. 
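To make the rolling-hash description above concrete, here is a toy single-base Karp-Rabin window using base 31, showing the constant-time slide; the real `sz_hashes` pairs two bases and a prime modulus, which this sketch does not reproduce:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    char const *text = "the quick brown fox";
    size_t length = strlen(text);
    size_t window = 4;  // a word-sized window, as suggested for English text above
    uint64_t base = 31; // the K&R base mentioned above

    // base^(window - 1), used to remove the outgoing byte in constant time.
    uint64_t leading_power = 1;
    for (size_t i = 0; i + 1 < window; ++i) leading_power *= base;

    // Hash of the first window.
    uint64_t hash = 0;
    for (size_t i = 0; i < window; ++i) hash = hash * base + (uint8_t)text[i];
    printf("'%.4s' -> %llu\n", text, (unsigned long long)hash);

    // Slide one byte at a time: drop the oldest byte, append the newest one.
    for (size_t i = window; i < length; ++i) {
        hash = (hash - (uint8_t)text[i - window] * leading_power) * base + (uint8_t)text[i];
        printf("'%.4s' -> %llu\n", text + i - window + 1, (unsigned long long)hash);
    }
    return 0;
}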
- * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text Input document. - * @param length Number of bytes in the input document. - * @param fingerprint Reference document fingerprint. - * @param fingerprint_bytes Number of bytes in the reference documents fingerprint. - * @param window_length Length of the rolling window in bytes. - * @see sz_hashes, sz_hashes_fingerprint - */ -SZ_PUBLIC sz_size_t sz_hashes_intersection( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_cptr_t fingerprint, sz_size_t fingerprint_bytes); - -typedef sz_size_t (*sz_hashes_intersection_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_cptr_t, sz_size_t); - -#pragma endregion - -#pragma region Convenience API - -/** - * @brief Finds the first character in the haystack, that is present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the first character in the haystack, that is @b not present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the last character in the haystack, that is present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the last character in the haystack, that is @b not present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -#pragma endregion - -#pragma region String Sequences API - -struct sz_sequence_t; - -typedef sz_cptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t); -typedef sz_bool_t (*sz_string_is_less_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); - -typedef struct sz_sequence_t { - sz_sorted_idx_t *order; - sz_size_t count; - sz_sequence_member_start_t get_start; - sz_sequence_member_length_t get_length; - void const *handle; -} sz_sequence_t; - -/** - * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. - * Expects ::offsets to contains `count + 1` entries, the last pointing at the end - * of the last string, indicating the total length of the ::tape. - */ -SZ_PUBLIC void sz_sequence_from_u32tape(sz_cptr_t *start, sz_u32_t const *offsets, sz_size_t count, - sz_sequence_t *sequence); - -/** - * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. - * Expects ::offsets to contains `count + 1` entries, the last pointing at the end - * of the last string, indicating the total length of the ::tape. 
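The tape layout expected by `sz_sequence_from_u32tape` is easiest to see on a tiny example: `count + 1` offsets over one concatenated buffer, with each string's start and length derived by subtraction, which is effectively what the `get_start` / `get_length` callbacks of `sz_sequence_t` provide:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Three strings packed into one tape, plus count + 1 = 4 offsets,
    // the last one marking the end of the final string, as required above.
    char const tape[] = "catdoghorse";
    uint32_t const offsets[] = {0, 3, 6, 11};
    size_t count = 3;

    for (size_t i = 0; i < count; ++i) {
        char const *start = tape + offsets[i];        // what a start-accessor yields
        uint32_t bytes = offsets[i + 1] - offsets[i]; // what a length-accessor yields
        printf("%zu: %.*s (%u bytes)\n", i, (int)bytes, start, (unsigned)bytes);
    }
    return 0;
}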
- */
-SZ_PUBLIC void sz_sequence_from_u64tape(sz_cptr_t *start, sz_u64_t const *offsets, sz_size_t count,
-                                        sz_sequence_t *sequence);
-
-/**
- * @brief Similar to `std::partition`: given a predicate, splits the sequence into two parts.
- *        The algorithm is unstable, meaning that elements may change relative order, as long
- *        as they are in the right partition. This is the simpler algorithm for partitioning.
- */
-SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate);
-
-/**
- * @brief In-place `std::set_union` for two consecutive chunks forming the same continuous `sequence`.
- *
- * @param partition The number of elements in the first sub-sequence in `sequence`.
- * @param less Comparison function to determine the lexicographic ordering.
- */
-SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less);
-
-/**
- * @brief Sorting algorithm, combining Radix Sort for the first 32 bits of every word
- *        with a follow-up by a more conventional sorting procedure on equally prefixed parts.
- */
-SZ_PUBLIC void sz_sort(sz_sequence_t *sequence);
-
-/**
- * @brief Partial sorting algorithm, combining Radix Sort for the first 32 bits of every word
- *        with a follow-up by a more conventional sorting procedure on equally prefixed parts.
- */
-SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t n);
-
-/**
- * @brief Intro-Sort algorithm that supports custom comparators.
- */
-SZ_PUBLIC void sz_sort_intro(sz_sequence_t *sequence, sz_sequence_comparator_t less);
-
-#pragma endregion
-
-/*
- * Hardware feature detection.
- * All of those can be controlled by the user.
- */
-#ifndef SZ_USE_X86_AVX512
-#ifdef __AVX512BW__
-#define SZ_USE_X86_AVX512 1
-#else
-#define SZ_USE_X86_AVX512 0
-#endif
-#endif
-
-#ifndef SZ_USE_X86_AVX2
-#ifdef __AVX2__
-#define SZ_USE_X86_AVX2 1
-#else
-#define SZ_USE_X86_AVX2 0
-#endif
-#endif
-
-#ifndef SZ_USE_ARM_NEON
-#ifdef __ARM_NEON
-#define SZ_USE_ARM_NEON 1
-#else
-#define SZ_USE_ARM_NEON 0
-#endif
-#endif
-
-#ifndef SZ_USE_ARM_SVE
-#ifdef __ARM_FEATURE_SVE
-#define SZ_USE_ARM_SVE 1
-#else
-#define SZ_USE_ARM_SVE 0
-#endif
-#endif
-
-/*
- * Include hardware-specific headers.
- */
-#if SZ_USE_X86_AVX512 || SZ_USE_X86_AVX2
-#include <immintrin.h>
-#endif // SZ_USE_X86...
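To make the rolling-hash API documented above easier to picture, here is a minimal, hedged usage sketch in C. It assumes the `sz_hash_callback_t` signature matches the internal fingerprint callbacks defined later in this header (substring start, length, hash, user handle); the `count_ngram_hashes` and `example_count_hashes` names are hypothetical, introduced only for this illustration.

    // Counts how many rolling hashes are reported for 4-byte windows, stepping one byte at a time,
    // mirroring the `window_length` guidance above for English text. The step of 1 is a power of two,
    // as the `window_step` documentation requires.
    static void count_ngram_hashes(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) {
        sz_size_t *counter = (sz_size_t *)handle;
        sz_unused(start && length && hash);
        ++*counter;
    }

    static sz_size_t example_count_hashes(sz_cptr_t text, sz_size_t length) {
        sz_size_t count = 0;
        sz_hashes(text, length, /* window_length */ 4, /* window_step */ 1, &count_ngram_hashes, &count);
        return count;
    }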
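A second hedged sketch shows how the String Sequences API above might be wired around a plain array of NUL-terminated C strings. The `demo_get_start` / `demo_get_length` helpers are hypothetical, and the assumption that `order` starts as the identity permutation and is rearranged by `sz_sort` is an illustration of the intent, not a statement of the library's exact contract.

    #include <string.h> // `strlen`, only needed for this sketch

    static sz_cptr_t demo_get_start(struct sz_sequence_t const *sequence, sz_size_t i) {
        return ((char const *const *)sequence->handle)[i];
    }
    static sz_size_t demo_get_length(struct sz_sequence_t const *sequence, sz_size_t i) {
        return strlen(((char const *const *)sequence->handle)[i]);
    }

    static void example_sort_three_strings(void) {
        char const *strings[] = {"banana", "apple", "cherry"};
        sz_sorted_idx_t order[] = {0, 1, 2}; // identity permutation, to be reordered
        sz_sequence_t sequence;
        sequence.order = order;
        sequence.count = 3;
        sequence.get_start = demo_get_start;
        sequence.get_length = demo_get_length;
        sequence.handle = strings;
        sz_sort(&sequence); // `order` should now rank the strings lexicographically
    }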
-#if SZ_USE_ARM_NEON
-#if !defined(_MSC_VER)
-#include <arm_acle.h>
-#endif
-#include <arm_neon.h>
-#endif // SZ_USE_ARM_NEON
-#if SZ_USE_ARM_SVE
-#if !defined(_MSC_VER)
-#include <arm_sve.h>
-#endif
-#endif // SZ_USE_ARM_SVE
-
-#pragma region Hardware Specific API
-
-#if SZ_USE_X86_AVX512
-
-/** @copydoc sz_equal */
-SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length);
-/** @copydoc sz_order */
-SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length);
-/** @copydoc sz_copy */
-SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
-/** @copydoc sz_move */
-SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
-/** @copydoc sz_fill */
-SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value);
-/** @copydoc sz_look_up_transform */
-SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target);
-/** @copydoc sz_find_byte */
-SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
-/** @copydoc sz_rfind_byte */
-SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
-/** @copydoc sz_find */
-SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length);
-/** @copydoc sz_rfind */
-SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length);
-/** @copydoc sz_find_charset */
-SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set);
-/** @copydoc sz_rfind_charset */
-SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set);
-/** @copydoc sz_edit_distance */
-SZ_PUBLIC sz_size_t sz_edit_distance_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
-                                            sz_size_t bound, sz_memory_allocator_t *alloc);
-/** @copydoc sz_alignment_score */
-SZ_PUBLIC sz_ssize_t sz_alignment_score_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
-                                               sz_error_cost_t const *subs, sz_error_cost_t gap,               //
-                                               sz_memory_allocator_t *alloc);
-/** @copydoc sz_hashes */
-SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, //
-                                sz_hash_callback_t callback, void *callback_handle);
-#endif
-
-#if SZ_USE_X86_AVX2
-/** @copydoc sz_equal */
-SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length);
-/** @copydoc sz_order */
-SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length);
-/** @copydoc sz_copy */
-SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
-/** @copydoc sz_move */
-SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
-/** @copydoc sz_fill */
-SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value);
-/** @copydoc sz_look_up_transform */
-SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target);
-/** @copydoc sz_find_byte */
-SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
-/** @copydoc sz_rfind_byte */
-SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
-/** @copydoc sz_find */
-SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t
n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle); -#endif - -#if SZ_USE_ARM_NEON -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -#endif - -#if SZ_USE_ARM_SVE -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_sve(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_sve(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_sve(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_sve(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_sve(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -#endif - -#pragma endregion - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wconversion" - -/* - ********************************************************************************************************************** - 
 **********************************************************************************************************************
- **********************************************************************************************************************
- *
- * This is where the actual implementation begins.
- * The rest of the file is hidden from the public API.
- *
- **********************************************************************************************************************
- **********************************************************************************************************************
- **********************************************************************************************************************
- */
-
-#pragma region Compiler Extensions and Helper Functions
-
-#pragma GCC visibility push(hidden)
-
-/**
- * @brief Helper-macro to mark potentially unused variables.
- */
-#define sz_unused(x) ((void)(x))
-
-/**
- * @brief Helper-macro casting a variable to another type of the same size.
- */
-#define sz_bitcast(type, value) (*((type *)&(value)))
-
-/**
- * @brief Defines `SZ_NULL`, analogous to `NULL`.
- *        The default often comes from locale.h, stddef.h,
- *        stdio.h, stdlib.h, string.h, time.h, or wchar.h.
- */
-#ifdef __GNUG__
-#define SZ_NULL __null
-#define SZ_NULL_CHAR __null
-#else
-#define SZ_NULL ((void *)0)
-#define SZ_NULL_CHAR ((char *)0)
-#endif
-
-/**
- * @brief Cache-line width that will affect the execution of some algorithms,
- *        like equality checks and relative order computing.
- */
-#define SZ_CACHE_LINE_WIDTH (64) // bytes
-
-/**
- * @brief Similar to `assert`, the `sz_assert` is used in the SZ_DEBUG mode
- *        to check the invariants of the library. It's a no-op in the SZ_RELEASE mode.
- * @note  If you want to catch it, put a breakpoint at @b `__GI_exit`
- */
-#if SZ_DEBUG && defined(SZ_AVOID_LIBC) && !SZ_AVOID_LIBC && !defined(SZ_PIC)
-#include <stdio.h>  // `fprintf`
-#include <stdlib.h> // `EXIT_FAILURE`
-SZ_PUBLIC void _sz_assert_failure(char const *condition, char const *file, int line) {
-    fprintf(stderr, "Assertion failed: %s, in file %s, line %d\n", condition, file, line);
-    exit(EXIT_FAILURE);
-}
-#define sz_assert(condition)                                                      \
-    do {                                                                          \
-        if (!(condition)) { _sz_assert_failure(#condition, __FILE__, __LINE__); } \
-    } while (0)
-#else
-#define sz_assert(condition) ((void)(condition))
-#endif
-
-/* Intrinsics aliases for MSVC, GCC, Clang, and Clang-Cl.
- * The following section of compiler intrinsics comes in 2 flavors.
- */
-#if defined(_MSC_VER) && !defined(__clang__) // On MSVC, but not Clang-CL
-#include <intrin.h>
-
-// Sadly, when building Win32 images, we can't use the `_tzcnt_u64`, `_lzcnt_u64`,
-// `_BitScanForward64`, or `_BitScanReverse64` intrinsics. For now it's a simple `for`-loop.
-// TODO: In the future we can switch to a more efficient De Bruijn's algorithm.
-// https://www.chessprogramming.org/BitScan
-// https://www.chessprogramming.org/De_Bruijn_Sequence
-// https://gist.github.com/resilar/e722d4600dbec9752771ab4c9d47044f
-//
-// Use the serial version on 32-bit x86 and on Arm.
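The TODO above points at a branch-free alternative for these fallbacks. A hedged sketch of that De Bruijn-sequence bit-scan, not part of the library, might look as follows; the lookup table is built programmatically so correctness only relies on the De Bruijn property of the constant, and the helper names are hypothetical.

    // Maps the isolated lowest set bit (a power of two) to its index via one multiply and a table lookup.
    // 0x03F79D71B4CB0A89 is a 64-bit De Bruijn sequence with six leading zero bits, so the top six bits
    // of `debruijn << i` are distinct for every i in [0, 64).
    static int _sz_debruijn_ctz_table[64];
    static void _sz_debruijn_ctz_table_init(void) { // call once before the first lookup
        sz_u64_t const debruijn = 0x03F79D71B4CB0A89ull;
        for (int i = 0; i != 64; ++i) _sz_debruijn_ctz_table[(debruijn << i) >> 58] = i;
    }
    static int _sz_u64_ctz_debruijn(sz_u64_t x) { // like the intrinsics, undefined for x == 0
        sz_u64_t const debruijn = 0x03F79D71B4CB0A89ull;
        sz_u64_t const lowest_bit = x & ~(x - 1); // isolate the least significant set bit
        return _sz_debruijn_ctz_table[(lowest_bit * debruijn) >> 58];
    }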
-#if (defined(_WIN32) && !defined(_WIN64)) || defined(_M_ARM) || defined(_M_ARM64) -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 1) == 0) { n++, x >>= 1; } - return n; -} -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 0x8000000000000000ull) == 0) { n++, x <<= 1; } - return n; -} -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { - x = x - ((x >> 1) & 0x5555555555555555ull); - x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull); - return (((x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Full) * 0x0101010101010101ull) >> 56; -} -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 1) == 0) { n++, x >>= 1; } - return n; -} -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 0x80000000u) == 0) { n++, x <<= 1; } - return n; -} -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { - x = x - ((x >> 1) & 0x55555555); - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - return (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; -} -#else -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return (int)_tzcnt_u64(x); } -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return (int)_lzcnt_u64(x); } -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return (int)__popcnt64(x); } -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return (int)_tzcnt_u32(x); } -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return (int)_lzcnt_u32(x); } -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return (int)__popcnt(x); } -#endif -// Force the byteswap functions to be intrinsics, because when /Oi- is given, these will turn into CRT function calls, -// which breaks when `SZ_AVOID_LIBC` is given -#pragma intrinsic(_byteswap_uint64) -SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return _byteswap_uint64(val); } -#pragma intrinsic(_byteswap_ulong) -SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return _byteswap_ulong(val); } -#else -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return __builtin_popcountll(x); } -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return __builtin_popcount(x); } -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return __builtin_ctzll(x); } -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return __builtin_clzll(x); } -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return __builtin_ctz(x); } // ! Undefined if `x == 0` -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return __builtin_clz(x); } // ! Undefined if `x == 0` -SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return __builtin_bswap64(val); } -SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return __builtin_bswap32(val); } -#endif - -SZ_INTERNAL sz_u64_t sz_u64_rotl(sz_u64_t x, sz_u64_t r) { return (x << r) | (x >> (64 - r)); } - -/** - * @brief Select bits from either ::a or ::b depending on the value of ::mask bits. - * - * Similar to `_mm_blend_epi16` intrinsic on x86. - * Described in the "Bit Twiddling Hacks" by Sean Eron Anderson. - * https://graphics.stanford.edu/~seander/bithacks.html#ConditionalSetOrClearBitsWithoutBranching - */ -SZ_INTERNAL sz_u64_t sz_u64_blend(sz_u64_t a, sz_u64_t b, sz_u64_t mask) { return a ^ ((a ^ b) & mask); } - -/* - * Efficiently computing the minimum and maximum of two or three values can be tricky. - * The simple branching baseline would be: - * - * x < y ? x : y // can replace with 1 conditional move - * - * Branchless approach is well known for signed integers, but it doesn't apply to unsigned ones. 
- * https://stackoverflow.com/questions/514435/templatized-branchless-int-max-min-function
- * https://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
- * Using only bit-shifts for signed integers it would be:
- *
- *      y + ((x - y) & (x - y) >> 31) // 4 unique operations
- *
- * Alternatively, for any integers using multiplication:
- *
- *      (x > y) * y + (x <= y) * x // 5 operations
- *
- * Alternatively, to avoid multiplication:
- *
- *      x & ~((x < y) - 1) + y & ((x < y) - 1) // 6 unique operations
- */
-#define sz_min_of_two(x, y) (x < y ? x : y)
-#define sz_max_of_two(x, y) (x < y ? y : x)
-#define sz_min_of_three(x, y, z) sz_min_of_two(x, sz_min_of_two(y, z))
-#define sz_max_of_three(x, y, z) sz_max_of_two(x, sz_max_of_two(y, z))
-
-/** @brief Branchless minimum function for two signed 32-bit integers. */
-SZ_INTERNAL sz_i32_t sz_i32_min_of_two(sz_i32_t x, sz_i32_t y) { return y + ((x - y) & (x - y) >> 31); }
-
-/** @brief Branchless maximum function for two signed 32-bit integers. */
-SZ_INTERNAL sz_i32_t sz_i32_max_of_two(sz_i32_t x, sz_i32_t y) { return x - ((x - y) & (x - y) >> 31); }
-
-/**
- * @brief Clamps signed offsets in a string to a valid range. Used for Pythonic-style slicing.
- */
-SZ_INTERNAL void sz_ssize_clamp_interval(sz_size_t length, sz_ssize_t start, sz_ssize_t end,
-                                         sz_size_t *normalized_offset, sz_size_t *normalized_length) {
-    // TODO: Remove branches.
-    // Normalize negative indices
-    if (start < 0) start += length;
-    if (end < 0) end += length;
-
-    // Clamp indices to a valid range
-    if (start < 0) start = 0;
-    if (end < 0) end = 0;
-    if (start > (sz_ssize_t)length) start = length;
-    if (end > (sz_ssize_t)length) end = length;
-
-    // Ensure start <= end
-    if (start > end) start = end;
-
-    *normalized_offset = start;
-    *normalized_length = end - start;
-}
-
-/**
- * @brief Compute the logarithm base 2 of a positive integer, rounding down.
- */
-SZ_INTERNAL sz_size_t sz_size_log2i_nonzero(sz_size_t x) {
-    sz_assert(x > 0 && "Non-positive numbers have no defined logarithm");
-    sz_size_t leading_zeros = sz_u64_clz(x);
-    return 63 - leading_zeros;
-}
-
-/**
- * @brief Compute the smallest power of two greater than or equal to ::x.
- */
-SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) {
-    // Unlike the commonly used trick with `clz` intrinsics, this is valid across the whole range of `x`.
-    // https://stackoverflow.com/a/10143264
-    x--;
-    x |= x >> 1;
-    x |= x >> 2;
-    x |= x >> 4;
-    x |= x >> 8;
-    x |= x >> 16;
-#if SZ_DETECT_64_BIT
-    x |= x >> 32;
-#endif
-    x++;
-    return x;
-}
-
-/**
- * @brief Transposes an 8x8 bit matrix packed in a `sz_u64_t`.
- *
- *        There is a well-known SWAR sequence for that, familiar to chess programmers
- *        who need to flip a bit-matrix of pieces along the main A1-H8 diagonal.
- *        https://www.chessprogramming.org/Flipping_Mirroring_and_Rotating
- *        https://lukas-prokop.at/articles/2021-07-23-transpose
- */
-SZ_INTERNAL sz_u64_t sz_u64_transpose(sz_u64_t x) {
-    sz_u64_t t;
-    t = x ^ (x << 36);
-    x ^= 0xf0f0f0f00f0f0f0full & (t ^ (x >> 36));
-    t = 0xcccc0000cccc0000ull & (x ^ (x << 18));
-    x ^= t ^ (t >> 18);
-    t = 0xaa00aa00aa00aa00ull & (x ^ (x << 9));
-    x ^= t ^ (t >> 9);
-    return x;
-}
-
-/**
- * @brief Helper that swaps two 64-bit integers representing the order of elements in the sequence.
- */
-SZ_INTERNAL void sz_u64_swap(sz_u64_t *a, sz_u64_t *b) {
-    sz_u64_t t = *a;
-    *a = *b;
-    *b = t;
-}
-
-/**
- * @brief Helper that swaps two pointers.
- */ -SZ_INTERNAL void sz_pointer_swap(void **a, void **b) { - void *t = *a; - *a = *b; - *b = t; -} - -/** - * @brief Helper structure to simplify work with 16-bit words. - * @see sz_u16_load - */ -typedef union sz_u16_vec_t { - sz_u16_t u16; - sz_u8_t u8s[2]; -} sz_u16_vec_t; - -/** - * @brief Load a 16-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u16_vec_t sz_u16_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u16_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u16_vec_t *)ptr); -#else - return *((__unaligned sz_u16_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u16_vec_t const *result = (sz_u16_vec_t const *)ptr; - return *result; -#endif -} - -/** - * @brief Helper structure to simplify work with 32-bit words. - * @see sz_u32_load - */ -typedef union sz_u32_vec_t { - sz_u32_t u32; - sz_u16_t u16s[2]; - sz_u8_t u8s[4]; -} sz_u32_vec_t; - -/** - * @brief Load a 32-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u32_vec_t sz_u32_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u32_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - result.u8s[2] = ptr[2]; - result.u8s[3] = ptr[3]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u32_vec_t *)ptr); -#else - return *((__unaligned sz_u32_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u32_vec_t const *result = (sz_u32_vec_t const *)ptr; - return *result; -#endif -} - -/** - * @brief Helper structure to simplify work with 64-bit words. - * @see sz_u64_load - */ -typedef union sz_u64_vec_t { - sz_u64_t u64; - sz_u32_t u32s[2]; - sz_u16_t u16s[4]; - sz_u8_t u8s[8]; -} sz_u64_vec_t; - -/** - * @brief Load a 64-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u64_vec_t sz_u64_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u64_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - result.u8s[2] = ptr[2]; - result.u8s[3] = ptr[3]; - result.u8s[4] = ptr[4]; - result.u8s[5] = ptr[5]; - result.u8s[6] = ptr[6]; - result.u8s[7] = ptr[7]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u64_vec_t *)ptr); -#else - return *((__unaligned sz_u64_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u64_vec_t const *result = (sz_u64_vec_t const *)ptr; - return *result; -#endif -} - -/** @brief Helper function, using the supplied fixed-capacity buffer to allocate memory. */ -SZ_INTERNAL sz_ptr_t _sz_memory_allocate_fixed(sz_size_t length, void *handle) { - sz_size_t capacity; - sz_copy((sz_ptr_t)&capacity, (sz_cptr_t)handle, sizeof(sz_size_t)); - sz_size_t consumed_capacity = sizeof(sz_size_t); - if (consumed_capacity + length > capacity) return SZ_NULL_CHAR; - return (sz_ptr_t)handle + consumed_capacity; -} - -/** @brief Helper "no-op" function, simulating memory deallocation when we use a "static" memory buffer. 
*/ -SZ_INTERNAL void _sz_memory_free_fixed(sz_ptr_t start, sz_size_t length, void *handle) { - sz_unused(start && length && handle); -} - -/** @brief An internal callback used to set a bit in a power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) & (fingerprint_bytes - 1)] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback used to set a bit in a @b non power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_non_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) % fingerprint_bytes] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback, used to mix all the running hashes into one pointer-size value. */ -SZ_INTERNAL void _sz_hashes_fingerprint_scalar_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *scalar_handle) { - sz_unused(start && length && hash && scalar_handle); - sz_size_t *scalar_ptr = (sz_size_t *)scalar_handle; - *scalar_ptr ^= hash; -} - -/** - * @brief Chooses the offsets of the most interesting characters in a search needle. - * - * Search throughput can significantly deteriorate if we are matching the wrong characters. - * Say the needle is "aXaYa", and we are comparing the first, second, and last character. - * If we use SIMD and compare many offsets at a time, comparing against "a" in every register is a waste. - * - * Similarly, dealing with UTF8 inputs, we know that the lower bits of each character code carry more information. - * Cyrillic alphabet, for example, falls into [0x0410, 0x042F] code range for uppercase [А, Я], and - * into [0x0430, 0x044F] for lowercase [а, я]. Scanning through a text written in Russian, half of the - * bytes will carry absolutely no value and will be equal to 0x04. - */ -SZ_INTERNAL void _sz_locate_needle_anomalies(sz_cptr_t start, sz_size_t length, // - sz_size_t *first, sz_size_t *second, sz_size_t *third) { - *first = 0; - *second = length / 2; - *third = length - 1; - - // - int has_duplicates = // - start[*first] == start[*second] || // - start[*first] == start[*third] || // - start[*second] == start[*third]; - - // Loop through letters to find non-colliding variants. - if (length > 3 && has_duplicates) { - // Pivot the middle point right, until we find a character different from the first one. - for (; start[*second] == start[*first] && *second + 1 < *third; ++(*second)) {} - // Pivot the third (last) point left, until we find a different character. - for (; (start[*third] == start[*second] || start[*third] == start[*first]) && *third > (*second + 1); - --(*third)) {} - } - - // TODO: Investigate alternative strategies for long needles. - // On very long needles we have the luxury to choose! 
- // Often dealing with UTF8, we will likely benefit from shifting the first and second characters
- // further to the right, to achieve not only uniqueness within the needle, but also avoid common
- // rune prefixes of 2-, 3-, and 4-byte codes.
- if (length > 8) {
-     // Pivot the first and second points right, until we find a character that:
-     // > is different from others.
-     // > doesn't start with 0b'110x'xxxx - only 5 bits of relevant info.
-     // > doesn't start with 0b'1110'xxxx - only 4 bits of relevant info.
-     // > doesn't start with 0b'1111'0xxx - only 3 bits of relevant info.
-     //
-     // So we are practically searching for byte values that start with 0b0xxx'xxxx or 0b'10xx'xxxx.
-     // Meaning they fall in the range [0, 127] and [128, 191], in other words any unsigned int up to 191.
-     sz_u8_t const *start_u8 = (sz_u8_t const *)start;
-     sz_size_t vibrant_first = *first, vibrant_second = *second, vibrant_third = *third;
-
-     // Let's begin with the second character, as the termination criterion there is more obvious
-     // and we may end up with more variants to check for the first candidate.
-     for (; (start_u8[vibrant_second] > 191 || start_u8[vibrant_second] == start_u8[vibrant_third]) &&
-            (vibrant_second + 1 < vibrant_third);
-          ++vibrant_second) {}
-
-     // Now check if we've indeed found a good candidate or should revert the `vibrant_second` to `second`.
-     if (start_u8[vibrant_second] < 191) { *second = vibrant_second; }
-     else { vibrant_second = *second; }
-
-     // Now check the first character.
-     for (; (start_u8[vibrant_first] > 191 || start_u8[vibrant_first] == start_u8[vibrant_second] ||
-             start_u8[vibrant_first] == start_u8[vibrant_third]) &&
-            (vibrant_first + 1 < vibrant_second);
-          ++vibrant_first) {}
-
-     // Now check if we've indeed found a good candidate or should revert the `vibrant_first` to `first`.
-     // We don't need to shift the third one when dealing with texts as the last byte of the text is
-     // also the last byte of a rune and contains the most information.
-     if (start_u8[vibrant_first] < 191) { *first = vibrant_first; }
- }
-}
-
-#pragma GCC visibility pop
-#pragma endregion
-
-#pragma region Serial Implementation
-
-#if !SZ_AVOID_LIBC
-#include <stdio.h>  // `fprintf`
-#include <stdlib.h> // `malloc`, `EXIT_FAILURE`
-
-SZ_PUBLIC void *_sz_memory_allocate_default(sz_size_t length, void *handle) {
-    sz_unused(handle);
-    return malloc(length);
-}
-SZ_PUBLIC void _sz_memory_free_default(sz_ptr_t start, sz_size_t length, void *handle) {
-    sz_unused(handle && length);
-    free(start);
-}
-
-#endif
-
-SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc) {
-#if !SZ_AVOID_LIBC
-    alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_default;
-    alloc->free = (sz_memory_free_t)_sz_memory_free_default;
-#else
-    alloc->allocate = (sz_memory_allocate_t)SZ_NULL;
-    alloc->free = (sz_memory_free_t)SZ_NULL;
-#endif
-    alloc->handle = SZ_NULL;
-}
-
-SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length) {
-    // The logic here is simple - put the buffer length in the first slots of the buffer.
-    // Later use it for bounds checking.
-    alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_fixed;
-    alloc->free = (sz_memory_free_t)_sz_memory_free_fixed;
-    alloc->handle = &buffer;
-    sz_copy((sz_ptr_t)buffer, (sz_cptr_t)&length, sizeof(sz_size_t));
-}
-
-/**
- * @brief Byte-level equality comparison between two strings.
- *        If unaligned loads are allowed, uses a switch-table to avoid loops on short strings.
- */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_cptr_t const a_end = a + length; -#if SZ_USE_MISALIGNED_LOADS - if (length >= SZ_SWAR_THRESHOLD) { - sz_u64_vec_t a_vec, b_vec; - for (; a + 8 <= a_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) return sz_false_k; - } - } -#endif - while (a != a_end && *a == *b) a++, b++; - return (sz_bool_t)(a_end == a); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { - for (sz_cptr_t const end = text + length; text != end; ++text) - if (sz_charset_contains(set, *text)) return text; - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Warray-bounds" - sz_cptr_t const end = text; - for (text += length; text != end;) - if (sz_charset_contains(set, *(text -= 1))) return text; - return SZ_NULL_CHAR; -#pragma GCC diagnostic pop -} - -/** - * One option to avoid branching is to use conditional moves and lookup the comparison result in a table: - * sz_ordering_t ordering_lookup[2] = {sz_greater_k, sz_less_k}; - * for (; a != min_end; ++a, ++b) - * if (*a != *b) return ordering_lookup[*a < *b]; - * That, however, introduces a data-dependency. - * A cleaner option is to perform two comparisons and a subtraction. - * One instruction more, but no data-dependency. - */ -#define _sz_order_scalars(a, b) ((sz_ordering_t)((a > b) - (a < b))) - -SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - sz_bool_t a_shorter = (sz_bool_t)(a_length < b_length); - sz_size_t min_length = a_shorter ? a_length : b_length; - sz_cptr_t min_end = a + min_length; -#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN - for (sz_u64_vec_t a_vec, b_vec; a + 8 <= min_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) - return _sz_order_scalars(sz_u64_bytes_reverse(a_vec.u64), sz_u64_bytes_reverse(b_vec.u64)); - } -#endif - for (; a != min_end; ++a, ++b) - if (*a != *b) return _sz_order_scalars(*a, *b); - - // If the strings are equal up to `min_end`, then the shorter string is smaller - return _sz_order_scalars(a_length, b_length); -} - -/** - * @brief Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each byte is set. - // For that take the bottom 7 bits of each byte, add one to them, - // and if this sets the top bit to one, then all the 7 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) & ((vec.u64 & 0x8080808080808080ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memchr(haystack, needle[0], haystack_length)`. - */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_end = h + h_length; - -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. 
-#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. - sz_u64_vec_t h_vec, n_vec, match_vec; - match_vec.u64 = 0; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h + 8 <= h_end; h += 8) { - h_vec.u64 = *(sz_u64_t const *)h; - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h + sz_u64_ctz(match_vec.u64) / 8; - } -#endif - - // Handle the misaligned tail. - for (; h < h_end; ++h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief Find the last occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memrchr(haystack, needle[0], haystack_length)`. - */ -sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_start = h; - - // Reposition the `h` pointer to the end, as we will be walking backwards. - h = h + h_length - 1; - -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. -#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)(h + 1) & 7ull) && h >= h_start; --h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. - sz_u64_vec_t h_vec, n_vec, match_vec; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h >= h_start + 7; h -= 8) { - h_vec.u64 = *(sz_u64_t const *)(h - 7); - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h - sz_u64_clz(match_vec.u64) / 8; - } -#endif - - for (; h >= h_start; --h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 2Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 2byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_2byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 2byte is set. - // For that take the bottom 15 bits of each 2byte, add one to them, - // and if this sets the top bit to one, then all the 15 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFF7FFF7FFF7FFFull) + 0x0001000100010001ull) & ((vec.u64 & 0x8000800080008000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b two-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_2byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 2 bytes long. - sz_assert(h_length >= 2 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. 
- for (; ((sz_size_t)h & 7ull) && h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; -#endif - - sz_u64_vec_t h_even_vec, h_odd_vec, n_vec, matches_even_vec, matches_odd_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1]; - n_vec.u64 *= 0x0001000100010001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. - for (; h + 9 <= h_end; h += 8) { - h_even_vec.u64 = *(sz_u64_t *)h; - h_odd_vec.u64 = (h_even_vec.u64 >> 8) | ((sz_u64_t)h[8] << 56); - matches_even_vec = _sz_u64_each_2byte_equal(h_even_vec, n_vec); - matches_odd_vec = _sz_u64_each_2byte_equal(h_odd_vec, n_vec); - - matches_even_vec.u64 >>= 8; - if (matches_even_vec.u64 + matches_odd_vec.u64) { - sz_u64_t match_indicators = matches_even_vec.u64 | matches_odd_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 4Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 4byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_4byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 4byte is set. - // For that take the bottom 31 bits of each 4byte, add one to them, - // and if this sets the top bit to one, then all the 31 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFFFFFF7FFFFFFFull) + 0x0000000100000001ull) & ((vec.u64 & 0x8000000080000000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b four-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_4byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 4 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 4 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; -#endif - - sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, n_vec, matches0_vec, matches1_vec, matches2_vec, matches3_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2], n_vec.u8s[3] = n[3]; - n_vec.u64 *= 0x0000000100000001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using four 64-bit words. - // We load the subsequent four-byte word as well, taking its first bytes. 
Think of it as a glorified prefetch :)
-    sz_u64_t h_page_current, h_page_next;
-    for (; h + sizeof(sz_u64_t) + sizeof(sz_u32_t) <= h_end; h += sizeof(sz_u64_t)) {
-        h_page_current = *(sz_u64_t *)h;
-        h_page_next = *(sz_u32_t *)(h + 8);
-        h0_vec.u64 = (h_page_current);
-        h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56);
-        h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48);
-        h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40);
-        matches0_vec = _sz_u64_each_4byte_equal(h0_vec, n_vec);
-        matches1_vec = _sz_u64_each_4byte_equal(h1_vec, n_vec);
-        matches2_vec = _sz_u64_each_4byte_equal(h2_vec, n_vec);
-        matches3_vec = _sz_u64_each_4byte_equal(h3_vec, n_vec);
-
-        if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64) {
-            matches0_vec.u64 >>= 24;
-            matches1_vec.u64 >>= 16;
-            matches2_vec.u64 >>= 8;
-            sz_u64_t match_indicators = matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64;
-            return h + sz_u64_ctz(match_indicators) / 8;
-        }
-    }
-
-    for (; h + 4 <= h_end; ++h)
-        if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h;
-    return SZ_NULL_CHAR;
-}
-
-/**
- * @brief 3Byte-level equality comparison between two 64-bit integers.
- * @return 64-bit integer, where every top bit in each 3byte signifies a match.
- */
-SZ_INTERNAL sz_u64_vec_t _sz_u64_each_3byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) {
-    sz_u64_vec_t vec;
-    vec.u64 = ~(a.u64 ^ b.u64);
-    // The match is valid, if every bit within each 3-byte chunk is set.
-    // For that take the bottom 23 bits of each chunk, add one to them,
-    // and if this sets the top bit to one, then all the 23 bits are ones as well.
-    vec.u64 = ((vec.u64 & 0xFFFF7FFFFF7FFFFFull) + 0x0000000001000001ull) & ((vec.u64 & 0x0000800000800000ull));
-    return vec;
-}
-
-/**
- * @brief Find the first occurrence of a @b three-character needle in an arbitrary length haystack.
- *        This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time.
- */
-SZ_INTERNAL sz_cptr_t _sz_find_3byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
-
-    // This is an internal method, and the haystack is guaranteed to be at least 3 bytes long.
-    sz_assert(h_length >= 3 && "The haystack is too short.");
-    sz_cptr_t const h_end = h + h_length;
-
-#if !SZ_USE_MISALIGNED_LOADS
-    // Process the misaligned head, to avoid UB on unaligned 64-bit loads.
-    for (; ((sz_size_t)h & 7ull) && h + 3 <= h_end; ++h)
-        if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h;
-#endif
-
-    // We will fetch the next 8-byte word, plus the following 2-byte word.
-    sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, h4_vec;
-    sz_u64_vec_t matches0_vec, matches1_vec, matches2_vec, matches3_vec, matches4_vec;
-    sz_u64_vec_t n_vec;
-    n_vec.u64 = 0;
-    n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2];
-    n_vec.u64 *= 0x0000000001000001ull; // broadcast
-
-    // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using five 64-bit words.
-    // We load the subsequent two-byte word as well.
- sz_u64_t h_page_current, h_page_next; - for (; h + sizeof(sz_u64_t) + sizeof(sz_u16_t) <= h_end; h += sizeof(sz_u64_t)) { - h_page_current = *(sz_u64_t *)h; - h_page_next = *(sz_u16_t *)(h + 8); - h0_vec.u64 = (h_page_current); - h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); - h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); - h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); - h4_vec.u64 = (h_page_current >> 32) | (h_page_next << 32); - matches0_vec = _sz_u64_each_3byte_equal(h0_vec, n_vec); - matches1_vec = _sz_u64_each_3byte_equal(h1_vec, n_vec); - matches2_vec = _sz_u64_each_3byte_equal(h2_vec, n_vec); - matches3_vec = _sz_u64_each_3byte_equal(h3_vec, n_vec); - matches4_vec = _sz_u64_each_3byte_equal(h4_vec, n_vec); - - if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64) { - matches0_vec.u64 >>= 16; - matches1_vec.u64 >>= 8; - matches3_vec.u64 <<= 8; - matches4_vec.u64 <<= 16; - sz_u64_t match_indicators = - matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 3 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_find_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - // Several popular string matching algorithms are using a bad-character shift table. - // Boyer Moore: https://www-igm.univ-mlv.fr/~lecroq/string/node14.html - // Quick Search: https://www-igm.univ-mlv.fr/~lecroq/string/node19.html - // Smith: https://www-igm.univ-mlv.fr/~lecroq/string/node21.html - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) bad_shift_table.jumps[n[i]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the last `n_length - 1` bytes. 
- for (sz_size_t i = 0; i <= h_length - n_length;) { - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - i += bad_shift_table.jumps[h[i + n_length - 1]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for @b reverse-order exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) - bad_shift_table.jumps[n[n_length - i - 1]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the first `n_length - 1` bytes. - for (sz_size_t j = 0; j <= h_length - n_length;) { - sz_size_t i = h_length - n_length - j; - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - j += bad_shift_table.jumps[h[i]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Exact substring search helper function, that finds the first occurrence of a prefix of the needle - * using a given search function, and then verifies the remaining part of the needle. - */ -SZ_INTERNAL sz_cptr_t _sz_find_with_prefix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_prefix, sz_size_t prefix_length) { - - sz_size_t suffix_length = n_length - prefix_length; - while (1) { - sz_cptr_t found = find_prefix(h, h_length, n, prefix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = h_length - (found - h); - if (remaining < n_length) return SZ_NULL_CHAR; - if (sz_equal(found + prefix_length, n + prefix_length, suffix_length)) return found; - - // Adjust the position. 
- h = found + 1; - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -/** - * @brief Exact reverse-order substring search helper function, that finds the last occurrence of a suffix of the - * needle using a given search function, and then verifies the remaining part of the needle. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_with_suffix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_suffix, sz_size_t suffix_length) { - - sz_size_t prefix_length = n_length - suffix_length; - while (1) { - sz_cptr_t found = find_suffix(h, h_length, n + prefix_length, suffix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = found - h; - if (remaining < prefix_length) return SZ_NULL_CHAR; - if (sz_equal(found - prefix_length, n, prefix_length)) return found - prefix_length; - - // Adjust the position. - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -SZ_INTERNAL sz_cptr_t _sz_find_over_4bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, (sz_find_t)_sz_find_4byte_serial, 4); -} - -SZ_INTERNAL sz_cptr_t _sz_find_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, - sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, _sz_find_horspool_upto_256bytes_serial, 256); -} - -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, - sz_size_t n_length) { - return _sz_rfind_with_suffix(h, h_length, n, n_length, _sz_rfind_horspool_upto_256bytes_serial, 256); -} - -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - -#if SZ_DETECT_BIG_ENDIAN - sz_find_t backends[] = { - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[(n_length > 1) + (n_length > 256)](h, h_length, n, n_length); -#else - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_2byte_serial, - (sz_find_t)_sz_find_3byte_serial, - (sz_find_t)_sz_find_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - (sz_find_t)_sz_find_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. - (n_length > 1) + (n_length > 2) + (n_length > 3) + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 4) + - // For longer needles - use skip tables. - (n_length > 8) + (n_length > 256)](h, h_length, n, n_length); -#endif -} - -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. 
- (sz_find_t)sz_rfind_byte_serial, - // TODO: implement reverse-order SWAR for 2/3/4 byte variants. - // TODO: (sz_find_t)_sz_rfind_2byte_serial, - // TODO: (sz_find_t)_sz_rfind_3byte_serial, - // TODO: (sz_find_t)_sz_rfind_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - // (sz_find_t)_sz_rfind_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_rfind_horspool_upto_256bytes_serial, - (sz_find_t)_sz_rfind_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. - 0 + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 1) + - // For longer needles - use skip tables. - (n_length > 256)](h, h_length, n, n_length); -} - -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // TODO: Generalize to remove the following asserts! - sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix."); - sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet."); - sz_unused(longer_length && bound); - - // We are going to store 3 diagonals of the matrix. - // The length of the longest (main) diagonal would be `n = (shorter_length + 1)`. - sz_size_t n = shorter_length + 1; - sz_size_t buffer_length = sizeof(sz_size_t) * n * 3; - sz_size_t *distances = (sz_size_t *)alloc->allocate(buffer_length, alloc->handle); - if (!distances) return SZ_SIZE_MAX; - - sz_size_t *previous_distances = distances; - sz_size_t *current_distances = previous_distances + n; - sz_size_t *next_distances = previous_distances + n * 2; - - // Initialize the first two diagonals: - previous_distances[0] = 0; - current_distances[0] = current_distances[1] = 1; - - // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_diagonal_index = 2; - for (; next_diagonal_index != n; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = next_diagonal_index + 1; - for (sz_size_t i = 0; i + 2 < next_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[next_diagonal_index - i - 2] != longer[i]; - sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; - sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; - next_distances[i + 1] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); - } - // Don't forget to populate the first row and the first column of the Levenshtein matrix. - next_distances[0] = next_distances[next_diagonal_length - 1] = next_diagonal_index; - // Perform a circular rotation of those buffers, to reuse the memory. - sz_size_t *temporary = previous_distances; - previous_distances = current_distances; - current_distances = next_distances; - next_distances = temporary; - } - - // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a - // larger diagonal. From now onwards, we will be shrinking. 
Instead of adding value equal to the skewed diagonal - // index on either side, we will be cropping those values out. - sz_size_t diagonals_count = n + n - 1; - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; - for (sz_size_t i = 0; i != next_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[shorter_length - 1 - i] != longer[next_diagonal_index - n + i]; - sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; - sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; - next_distances[i] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); - } - // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift, - // dropping the first element in the current array. - sz_size_t *temporary = previous_distances; - previous_distances = current_distances + 1; - current_distances = next_distances; - next_distances = temporary; - } - - // Cache scalar before `free` call. - sz_size_t result = current_distances[0]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -/** - * @brief Describes the length of a UTF8 character / codepoint / rune in bytes. - */ -typedef enum { - sz_utf8_invalid_k = 0, //!< Invalid UTF8 character. - sz_utf8_rune_1byte_k = 1, //!< 1-byte UTF8 character. - sz_utf8_rune_2bytes_k = 2, //!< 2-byte UTF8 character. - sz_utf8_rune_3bytes_k = 3, //!< 3-byte UTF8 character. - sz_utf8_rune_4bytes_k = 4, //!< 4-byte UTF8 character. -} sz_rune_length_t; - -typedef sz_u32_t sz_rune_t; - -/** - * @brief Extracts just one UTF8 codepoint from a UTF8 string into a 32-bit unsigned integer. - */ -SZ_INTERNAL void _sz_extract_utf8_rune(sz_cptr_t utf8, sz_rune_t *code, sz_rune_length_t *code_length) { - sz_u8_t const *current = (sz_u8_t const *)utf8; - sz_u8_t leading_byte = *current++; - sz_rune_t ch; - sz_rune_length_t ch_length; - - // TODO: This can be made entirely branchless using 32-bit SWAR. - if (leading_byte < 0x80) { - // Single-byte rune (0xxxxxxx) - ch = leading_byte; - ch_length = sz_utf8_rune_1byte_k; - } - else if ((leading_byte & 0xE0) == 0xC0) { - // Two-byte rune (110xxxxx 10xxxxxx) - ch = (leading_byte & 0x1F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_2bytes_k; - } - else if ((leading_byte & 0xF0) == 0xE0) { - // Three-byte rune (1110xxxx 10xxxxxx 10xxxxxx) - ch = (leading_byte & 0x0F) << 12; - ch |= (*current++ & 0x3F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_3bytes_k; - } - else if ((leading_byte & 0xF8) == 0xF0) { - // Four-byte rune (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) - ch = (leading_byte & 0x07) << 18; - ch |= (*current++ & 0x3F) << 12; - ch |= (*current++ & 0x3F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_4bytes_k; - } - else { - // Invalid UTF8 rune. - ch = 0; - ch_length = sz_utf8_invalid_k; - } - *code = ch; - *code_length = ch_length; -} - -/** - * @brief Exports a UTF8 string into a UTF32 buffer. - * ! The result is undefined id the UTF8 string is corrupted. - * @return The length in the number of codepoints. 
- */ -SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_length, sz_rune_t *utf32) { - sz_cptr_t const end = utf8 + utf8_length; - sz_size_t count = 0; - sz_rune_length_t rune_length; - for (; utf8 != end; utf8 += rune_length, utf32++, count++) _sz_extract_utf8_rune(utf8, utf32, &rune_length); - return count; -} - -/** - * @brief Compute the Levenshtein distance between two strings using the Wagner-Fisher algorithm. - * Stores only 2 rows of the Levenshtein matrix, but uses 64-bit integers for the distance values, - * and upcasts UTF8 variable-length codepoints to 64-bit integers for faster addressing. + * Includes: * - * ! In the worst case for 2 strings of length 100, that contain just one 16-bit codepoint this will result in extra: - * + 2 rows * 100 slots * 8 bytes/slot = 1600 bytes of memory for the two rows of the Levenshtein matrix rows. - * + 100 codepoints * 2 strings * 4 bytes/codepoint = 800 bytes of memory for the UTF8 buffer. - * = 2400 bytes of memory or @b 12x memory amplification! + * - `sz_copy` - analog to `memcpy` + * - `sz_move` - analog to `memmove` + * - `sz_fill` - analog to `memset` + * - `sz_look_up_transform` - LUT transformation of a string, similar to OpenCV LUT + * - `sz_detect_encoding` - similar to `iconv` or `chardet` + * + * Convenience functions for character-set mapping: + * + * - `sz_tolower`, `sz_toupper`, `sz_toascii` for ASCII ranges */ -SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_bool_t can_be_unicode, sz_memory_allocator_t *alloc) { - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // A good idea may be to dispatch different kernels for different string lengths. - // Like using `uint8_t` counters for strings under 255 characters long. - // Good in theory, this results in frequent upcasts and downcasts in serial code. - // On strings over 20 bytes, using `uint8` over `uint64` on 64-bit x86 CPU doubles the execution time. - // So one must be very cautious with such optimizations. - typedef sz_size_t _distance_t; - - // Compute the number of columns in our Levenshtein matrix. - sz_size_t const n = shorter_length + 1; - - // If a buffering memory-allocator is provided, this operation is practically free, - // and cheaper than allocating even 512 bytes (for small distance matrices) on stack. - sz_size_t buffer_length = sizeof(_distance_t) * (n * 2); - - // If the strings contain Unicode characters, let's estimate the max character width, - // and use it to allocate a larger buffer to decode UTF8. - if ((can_be_unicode == sz_true_k) && - (sz_isascii(longer, longer_length) == sz_false_k || sz_isascii(shorter, shorter_length) == sz_false_k)) { - buffer_length += (shorter_length + longer_length) * sizeof(sz_rune_t); - } - else { can_be_unicode = sz_false_k; } - - // If the allocation fails, return the maximum distance. - sz_ptr_t const buffer = (sz_ptr_t)alloc->allocate(buffer_length, alloc->handle); - if (!buffer) return SZ_SIZE_MAX; - - // Let's export the UTF8 sequence into the newly allocated buffer at the end. 
- if (can_be_unicode == sz_true_k) { - sz_rune_t *const longer_utf32 = (sz_rune_t *)(buffer + sizeof(_distance_t) * (n * 2)); - sz_rune_t *const shorter_utf32 = longer_utf32 + longer_length; - // Export the UTF8 sequences into the newly allocated buffer. - longer_length = _sz_export_utf8_to_utf32(longer, longer_length, longer_utf32); - shorter_length = _sz_export_utf8_to_utf32(shorter, shorter_length, shorter_utf32); - longer = (sz_cptr_t)longer_utf32; - shorter = (sz_cptr_t)shorter_utf32; - } - - // Let's parameterize the core logic for different character types and distance types. -#define _wagner_fisher_unbounded(_distance_t, _char_t) \ - /* Now let's cast our pointer to avoid it in subsequent sections. */ \ - _char_t const *const longer_chars = (_char_t const *)longer; \ - _char_t const *const shorter_chars = (_char_t const *)shorter; \ - _distance_t *previous_distances = (_distance_t *)buffer; \ - _distance_t *current_distances = previous_distances + n; \ - /* Initialize the first row of the Levenshtein matrix with `iota`-style arithmetic progression. */ \ - for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ - /* The main loop of the algorithm with quadratic complexity. */ \ - for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ - _char_t const longer_char = longer_chars[idx_longer]; \ - /* Using pure pointer arithmetic is faster than iterating with an index. */ \ - _char_t const *shorter_ptr = shorter_chars; \ - _distance_t const *previous_ptr = previous_distances; \ - _distance_t *current_ptr = current_distances; \ - _distance_t *const current_end = current_ptr + shorter_length; \ - current_ptr[0] = idx_longer + 1; \ - for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ - _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ - /* We can avoid `+1` for costs here, shifting it to post-minimum computation, */ \ - /* saving one increment operation. */ \ - _distance_t cost_deletion = previous_ptr[1]; \ - _distance_t cost_insertion = current_ptr[0]; \ - /* ? It might be a good idea to enforce branchless execution here. */ \ - /* ? The caveat being that the benchmarks on longer sequences backfire and more research is needed. */ \ - current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ - } \ - /* Swap `previous_distances` and `current_distances` pointers. */ \ - _distance_t *temporary = previous_distances; \ - previous_distances = current_distances; \ - current_distances = temporary; \ - } \ - /* Cache scalar before `free` call. */ \ - sz_size_t result = previous_distances[shorter_length]; \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return result; - - // Let's define a separate variant for bounded distance computation. - // Practically the same as unbounded, but also collecting the running minimum within each row for early exit. 
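/* For intuition, a minimal standalone sketch of the idea above: keep only two rows of the matrix and
 * track the running minimum of each row, bailing out once it reaches the bound. The helper name, the
 * fixed 64-cell rows, and the plain `size_t` types are illustrative assumptions, not part of this header. */
#include <stddef.h>
#include <string.h>

static size_t levenshtein_two_rows_bounded(char const *a, size_t a_length, char const *b, size_t b_length,
                                           size_t bound) {
    size_t previous[64], current[64]; /* Assumes b_length < 64 to keep the sketch allocation-free. */
    for (size_t j = 0; j <= b_length; ++j) previous[j] = j;
    for (size_t i = 0; i != a_length; ++i) {
        current[0] = i + 1;
        size_t row_minimum = current[0];
        for (size_t j = 0; j != b_length; ++j) {
            size_t if_substitution = previous[j] + (a[i] != b[j]);
            size_t if_deletion = previous[j + 1] + 1;
            size_t if_insertion = current[j] + 1;
            size_t cell = if_substitution < if_deletion ? if_substitution : if_deletion;
            current[j + 1] = cell < if_insertion ? cell : if_insertion;
            if (current[j + 1] < row_minimum) row_minimum = current[j + 1];
        }
        /* Row minima never decrease, so once the whole row is at or above the bound we can stop. */
        if (bound && row_minimum >= bound) return bound;
        memcpy(previous, current, (b_length + 1) * sizeof(size_t));
    }
    return bound && previous[b_length] > bound ? bound : previous[b_length];
}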
-#define _wagner_fisher_bounded(_distance_t, _char_t) \ - _char_t const *const longer_chars = (_char_t const *)longer; \ - _char_t const *const shorter_chars = (_char_t const *)shorter; \ - _distance_t *previous_distances = (_distance_t *)buffer; \ - _distance_t *current_distances = previous_distances + n; \ - for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ - for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ - _char_t const longer_char = longer_chars[idx_longer]; \ - _char_t const *shorter_ptr = shorter_chars; \ - _distance_t const *previous_ptr = previous_distances; \ - _distance_t *current_ptr = current_distances; \ - _distance_t *const current_end = current_ptr + shorter_length; \ - current_ptr[0] = idx_longer + 1; \ - /* Initialize min_distance with a value greater than bound */ \ - _distance_t min_distance = bound - 1; \ - for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ - _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ - _distance_t cost_deletion = previous_ptr[1]; \ - _distance_t cost_insertion = current_ptr[0]; \ - current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ - /* Keep track of the minimum distance seen so far in this row */ \ - min_distance = sz_min_of_two(current_ptr[1], min_distance); \ - } \ - /* If the minimum distance in this row exceeded the bound, return early */ \ - if (min_distance >= bound) { \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return bound; \ - } \ - _distance_t *temporary = previous_distances; \ - previous_distances = current_distances; \ - current_distances = temporary; \ - } \ - sz_size_t result = previous_distances[shorter_length]; \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return sz_min_of_two(result, bound); - - // Dispatch the actual computation. - if (!bound) { - if (can_be_unicode == sz_true_k) { _wagner_fisher_unbounded(sz_size_t, sz_rune_t); } - else { _wagner_fisher_unbounded(sz_size_t, sz_u8_t); } - } - else { - if (can_be_unicode == sz_true_k) { _wagner_fisher_bounded(sz_size_t, sz_rune_t); } - else { _wagner_fisher_bounded(sz_size_t, sz_u8_t); } - } -} - -SZ_PUBLIC sz_size_t sz_edit_distance_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Skip the matching prefixes and suffixes, they won't affect the distance. - for (sz_cptr_t a_end = longer + longer_length, b_end = shorter + shorter_length; - longer != a_end && shorter != b_end && *longer == *shorter; - ++longer, ++shorter, --longer_length, --shorter_length); - for (; longer_length && shorter_length && longer[longer_length - 1] == shorter[shorter_length - 1]; - --longer_length, --shorter_length); - - // Bounded computations may exit early. - int const is_bounded = bound < longer_length; - if (is_bounded) { - // If one of the strings is empty - the edit distance is equal to the length of the other one. 
- if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); - // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; - } - - if (shorter_length == 0) return longer_length; // If no mismatches were found - the distance is zero. - if (shorter_length == longer_length && !is_bounded) - return _sz_edit_distance_skewed_diagonals_serial(longer, longer_length, shorter, shorter_length, bound, alloc); - return _sz_edit_distance_wagner_fisher_serial(longer, longer_length, shorter, shorter_length, bound, sz_false_k, - alloc); -} - -SZ_PUBLIC sz_ssize_t sz_alignment_score_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc) { - - // If one of the strings is empty - the edit distance is equal to the length of the other one - if (longer_length == 0) return (sz_ssize_t)shorter_length * gap; - if (shorter_length == 0) return (sz_ssize_t)longer_length * gap; - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - sz_size_t n = shorter_length + 1; - sz_size_t buffer_length = sizeof(sz_ssize_t) * n * 2; - sz_ssize_t *distances = (sz_ssize_t *)alloc->allocate(buffer_length, alloc->handle); - sz_ssize_t *previous_distances = distances; - sz_ssize_t *current_distances = previous_distances + n; +#ifndef STRINGZILLA_MEMORY_H_ +#define STRINGZILLA_MEMORY_H_ - for (sz_size_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) - previous_distances[idx_shorter] = (sz_ssize_t)idx_shorter * gap; +#include "types.h" - sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter; - sz_u8_t const *longer_unsigned = (sz_u8_t const *)longer; - for (sz_size_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { - current_distances[0] = ((sz_ssize_t)idx_longer + 1) * gap; +#ifdef __cplusplus +extern "C" { +#endif - // Initialize min_distance with a value greater than bound - sz_error_cost_t const *a_subs = subs + longer_unsigned[idx_longer] * 256ul; - for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) { - sz_ssize_t cost_deletion = previous_distances[idx_shorter + 1] + gap; - sz_ssize_t cost_insertion = current_distances[idx_shorter] + gap; - sz_ssize_t cost_substitution = previous_distances[idx_shorter] + a_subs[shorter_unsigned[idx_shorter]]; - current_distances[idx_shorter + 1] = sz_max_of_three(cost_deletion, cost_insertion, cost_substitution); - } +#pragma region Core API - // Swap previous_distances and current_distances pointers - sz_pointer_swap((void **)&previous_distances, (void **)¤t_distances); - } +/** + * @brief Similar to `memcpy`, copies contents of one string into another. + * The behavior is undefined if the strings overlap. + * + * @param target String to copy into. + * @param length Number of bytes to copy. + * @param source String to copy from. 
+ */
+SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
-
-    // Cache scalar before `free` call.
-    sz_ssize_t result = previous_distances[shorter_length];
-    alloc->free(distances, buffer_length, alloc->handle);
-    return result;
-}
+/**
+ * @brief Similar to `memmove`, copies (moves) contents of one string into another.
+ *        Unlike `sz_copy`, allows overlapping strings as arguments.
+ *
+ * @param target String to copy into.
+ * @param length Number of bytes to copy.
+ * @param source String to copy from.
+ */
+SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
-SZ_PUBLIC sz_size_t sz_hamming_distance_serial( //
-    sz_cptr_t a, sz_size_t a_length,            //
-    sz_cptr_t b, sz_size_t b_length,            //
-    sz_size_t bound) {
+/**
+ * @brief Similar to `memset`, fills a string with a given value.
+ *
+ * @param target String to fill.
+ * @param length Number of bytes to fill.
+ * @param value Value to fill with.
+ */
+SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value);
-    sz_size_t const min_length = sz_min_of_two(a_length, b_length);
-    sz_size_t const max_length = sz_max_of_two(a_length, b_length);
-    sz_cptr_t const a_end = a + min_length;
-    bound = bound == 0 ? max_length : bound;
+/** @copydoc sz_copy */
+SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
+/** @copydoc sz_move */
+SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
+/** @copydoc sz_fill */
+SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value);
-    // Walk through both strings using SWAR and counting the number of differing characters.
-    sz_size_t distance = max_length - min_length;
-#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN
-    if (min_length >= SZ_SWAR_THRESHOLD) {
-        sz_u64_vec_t a_vec, b_vec, match_vec;
-        for (; a + 8 <= a_end && distance < bound; a += 8, b += 8) {
-            a_vec.u64 = sz_u64_load(a).u64;
-            b_vec.u64 = sz_u64_load(b).u64;
-            match_vec = _sz_u64_each_byte_equal(a_vec, b_vec);
-            distance += sz_u64_popcount((~match_vec.u64) & 0x8080808080808080ull);
-        }
-    }
+#if SZ_USE_HASWELL
+/** @copydoc sz_copy */
+SZ_PUBLIC sz_cptr_t sz_copy_haswell(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
+/** @copydoc sz_move */
+SZ_PUBLIC sz_cptr_t sz_move_haswell(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
+/** @copydoc sz_fill */
+SZ_PUBLIC sz_cptr_t sz_fill_haswell(sz_ptr_t target, sz_size_t length, sz_u8_t value);
 #endif
-    for (; a != a_end && distance < bound; ++a, ++b) { distance += (*a != *b); }
-    return sz_min_of_two(distance, bound);
-}
-
-SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( //
-    sz_cptr_t a, sz_size_t a_length,                 //
-    sz_cptr_t b, sz_size_t b_length,                 //
-    sz_size_t bound) {
+#if SZ_USE_SKYLAKE
+/** @copydoc sz_copy */
+SZ_PUBLIC sz_cptr_t sz_copy_skylake(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
+/** @copydoc sz_move */
+SZ_PUBLIC sz_cptr_t sz_move_skylake(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
+/** @copydoc sz_fill */
+SZ_PUBLIC sz_cptr_t sz_fill_skylake(sz_ptr_t target, sz_size_t length, sz_u8_t value);
+#endif
-    sz_cptr_t const a_end = a + a_length;
-    sz_cptr_t const b_end = b + b_length;
-    sz_size_t distance = 0;
+#if SZ_USE_NEON
+/** @copydoc sz_copy */
+SZ_PUBLIC sz_cptr_t sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
+/** @copydoc sz_move */
+SZ_PUBLIC sz_cptr_t sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
+/** @copydoc sz_fill */
+SZ_PUBLIC
sz_cptr_t sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value); +#endif - sz_rune_t a_rune, b_rune; - sz_rune_length_t a_rune_length, b_rune_length; +/** + * @brief Look Up Table @b (LUT) transformation of a string. Equivalent to `for (char & c : text) c = lut[c]`. + * + * Can be used to implement some form of string normalization, partially masking punctuation marks, + * or converting between different character sets, like uppercase or lowercase. Surprisingly, also has + * broad implications in image processing, where image channel transformations are often done using LUTs. + * + * @param text String to be normalized. + * @param length Number of bytes in the string. + * @param lut Look Up Table to apply. Must be exactly @b 256 bytes long. + * @param result Output string, can point to the same address as ::text. + */ +SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); - if (bound) { - for (; a < a_end && b < b_end && distance < bound; a += a_rune_length, b += b_rune_length) { - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - distance += (a_rune != b_rune); - } - // If one string has more runes, we need to go through the tail. - if (distance < bound) { - for (; a < a_end && distance < bound; a += a_rune_length, ++distance) - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); +/** @copydoc sz_look_up_transform */ +SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); - for (; b < b_end && distance < bound; b += b_rune_length, ++distance) - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - } - } - else { - for (; a < a_end && b < b_end; a += a_rune_length, b += b_rune_length) { - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - distance += (a_rune != b_rune); - } - // If one string has more runes, we need to go through the tail. - for (; a < a_end; a += a_rune_length, ++distance) _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - for (; b < b_end; b += b_rune_length, ++distance) _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - } - return distance; -} +#pragma endregion // Core API -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length) { - sz_u64_t checksum = 0; - sz_u8_t const *text_u8 = (sz_u8_t const *)text; - sz_u8_t const *text_end = text_u8 + length; - for (; text_u8 != text_end; ++text_u8) checksum += *text_u8; - return checksum; -} +#pragma region Helper API /** - * @brief Largest prime number that fits into 31 bits. - * @see https://mersenneforum.org/showthread.php?t=3471 + * @brief Equivalent to `for (char & c : text) c = tolower(c)`. + * + * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. + * So there are 26 english letters, shifted by 32 values, meaning that a conversion + * can be done by flipping the 5th bit each inappropriate character byte. This, however, + * breaks for extended ASCII, so a different solution is needed. + * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html + * + * @param text String to be normalized. + * @param length Number of bytes in the string. + * @param result Output string, can point to the same address as ::text. */ -#define SZ_U32_MAX_PRIME (2147483647u) +SZ_PUBLIC void sz_tolower(sz_cptr_t text, sz_size_t length, sz_ptr_t result); /** - * @brief Largest prime number that fits into 64 bits. 
- * @see https://mersenneforum.org/showthread.php?t=3471 + * @brief Equivalent to `for (char & c : text) c = toupper(c)`. * - * 2^64 = 18,446,744,073,709,551,616 - * this = 18,446,744,073,709,551,557 - * diff = 59 + * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. + * So there are 26 english letters, shifted by 32 values, meaning that a conversion + * can be done by flipping the 5th bit each inappropriate character byte. This, however, + * breaks for extended ASCII, so a different solution is needed. + * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html + * + * @param text String to be normalized. + * @param length Number of bytes in the string. + * @param result Output string, can point to the same address as ::text. */ -#define SZ_U64_MAX_PRIME (18446744073709551557ull) +SZ_PUBLIC void sz_toupper(sz_cptr_t text, sz_size_t length, sz_ptr_t result); -/* - * One hardware-accelerated way of mixing hashes can be CRC, but it's only implemented for 32-bit values. - * Using a Boost-like mixer works very poorly in such case: - * - * hash_first ^ (hash_second + 0x517cc1b727220a95 + (hash_first << 6) + (hash_first >> 2)); +/** + * @brief Equivalent to `for (char & c : text) c = toascii(c)`. * - * Let's stick to the Fibonacci hash trick using the golden ratio. - * https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ + * @param text String to be normalized. + * @param length Number of bytes in the string. + * @param result Output string, can point to the same address as ::text. */ -#define _sz_hash_mix(first, second) ((first * 11400714819323198485ull) ^ (second * 11400714819323198485ull)) -#define _sz_shift_low(x) (x) -#define _sz_shift_high(x) ((x + 77ull) & 0xFFull) -#define _sz_prime_mod(x) (x % SZ_U64_MAX_PRIME) - -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length) { - - sz_u64_t hash_low = 0; - sz_u64_t hash_high = 0; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - switch (length) { - case 0: return 0; - - // Texts under 7 bytes long are definitely below the largest prime. 
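/* For reference, a standalone sketch of the same dual polynomial hash in plain-loop form: the switch
 * below just unrolls the first seven steps of it. The helper name and the use of an explicit `%` on
 * every step are illustrative assumptions, not part of this header. */
#include <stddef.h>
#include <stdint.h>

static uint64_t polynomial_hash_sketch(uint8_t const *text, size_t length) {
    uint64_t const max_prime = 18446744073709551557ull;    /* Largest prime that fits into 64 bits. */
    uint64_t const golden_ratio = 11400714819323198485ull; /* Fibonacci-hashing multiplier, ~2^64 / phi. */
    uint64_t hash_low = 0, hash_high = 0;
    for (size_t i = 0; i != length; ++i) {
        hash_low = (hash_low * 31ull + text[i]) % max_prime;
        hash_high = (hash_high * 257ull + ((text[i] + 77ull) & 0xFFull)) % max_prime;
    }
    return (hash_low * golden_ratio) ^ (hash_high * golden_ratio);
}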
- case 1: - hash_low = _sz_shift_low(text[0]); - hash_high = _sz_shift_high(text[0]); - break; - case 2: - hash_low = _sz_shift_low(text[0]) * 31ull + _sz_shift_low(text[1]); - hash_high = _sz_shift_high(text[0]) * 257ull + _sz_shift_high(text[1]); - break; - case 3: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull + // - _sz_shift_low(text[2]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull + // - _sz_shift_high(text[2]); - break; - case 4: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull + // - _sz_shift_low(text[3]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull + // - _sz_shift_high(text[3]); - break; - case 5: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull + // - _sz_shift_low(text[4]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull + // - _sz_shift_high(text[4]); - break; - case 6: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull + // - _sz_shift_low(text[5]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull + // - _sz_shift_high(text[5]); - break; - case 7: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull * 31ull + // - _sz_shift_low(text[5]) * 31ull + // - _sz_shift_low(text[6]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull * 257ull + // - _sz_shift_high(text[5]) * 257ull + // - _sz_shift_high(text[6]); - break; - default: - // Unroll the first seven cycles: - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - hash_low = hash_low * 31ull + _sz_shift_low(text[1]); - hash_high = hash_high * 257ull + _sz_shift_high(text[1]); - hash_low = hash_low * 31ull + _sz_shift_low(text[2]); - hash_high = hash_high * 257ull + _sz_shift_high(text[2]); - hash_low = hash_low * 31ull + _sz_shift_low(text[3]); - hash_high = hash_high * 257ull + _sz_shift_high(text[3]); - hash_low = hash_low * 31ull + _sz_shift_low(text[4]); - hash_high = hash_high * 257ull + _sz_shift_high(text[4]); - hash_low = hash_low * 31ull + 
_sz_shift_low(text[5]); - hash_high = hash_high * 257ull + _sz_shift_high(text[5]); - hash_low = hash_low * 31ull + _sz_shift_low(text[6]); - hash_high = hash_high * 257ull + _sz_shift_high(text[6]); - text += 7; - - // Iterate throw the rest with the modulus: - for (; text != text_end; ++text) { - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - } - break; - } - - return _sz_hash_mix(hash_low, hash_high); -} - -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Compute the initial hash value for the first window. - sz_u64_t hash_low = 0, hash_high = 0, hash_mix; - for (sz_u8_t const *first_end = text + window_length; text < first_end; ++text) - hash_low = (hash_low * 31ull + _sz_shift_low(*text)) % SZ_U64_MAX_PRIME, - hash_high = (hash_high * 257ull + _sz_shift_high(*text)) % SZ_U64_MAX_PRIME; +SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - // In most cases the fingerprint length will be a power of two. - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); +/** + * @brief Checks if all characters in the range are valid ASCII characters. + * + * @param text String to be analyzed. + * @param length Number of bytes in the string. + * @return Whether all characters are valid ASCII characters. + */ +SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t text, sz_size_t length); - // Compute the hash value for every window, exporting into the fingerprint, - // using the expensive modulo operation. - sz_size_t cycles = 1; - sz_size_t const step_mask = step - 1; - for (; text < text_end; ++text, ++cycles) { - // Discard one character: - hash_low -= _sz_shift_low(*(text - window_length)) * prime_power_low; - hash_high -= _sz_shift_high(*(text - window_length)) * prime_power_high; - // And add a new one: - hash_low = 31ull * hash_low + _sz_shift_low(*text); - hash_high = 257ull * hash_high + _sz_shift_high(*text); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - // Mix only if we've skipped enough hashes. - if ((cycles & step_mask) == 0) { - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - } - } -} +#pragma endregion // Helper API -#undef _sz_shift_low -#undef _sz_shift_high -#undef _sz_hash_mix -#undef _sz_prime_mod +#pragma region Serial Implementation /** * @brief Uses a small lookup-table to convert a lowercase character to uppercase. @@ -3128,52 +212,6 @@ SZ_INTERNAL sz_u8_t sz_u8_toupper(sz_u8_t c) { return upped[c]; } -/** - * @brief Uses two small lookup tables (768 bytes total) to accelerate division by a small - * unsigned integer. Performs two lookups, one multiplication, two shifts, and two accumulations. 
- * - * @param divisor Integral value @b larger than one. - * @param number Integral value to divide. - */ -SZ_INTERNAL sz_u8_t sz_u8_divide(sz_u8_t number, sz_u8_t divisor) { - sz_assert(divisor > 1); - static sz_u16_t const multipliers[256] = { - 0, 0, 0, 21846, 0, 39322, 21846, 9363, 0, 50973, 39322, 29790, 21846, 15124, 9363, 4370, - 0, 57826, 50973, 44841, 39322, 34329, 29790, 25645, 21846, 18351, 15124, 12137, 9363, 6780, 4370, 2115, - 0, 61565, 57826, 54302, 50973, 47824, 44841, 42011, 39322, 36765, 34329, 32006, 29790, 27671, 25645, 23705, - 21846, 20063, 18351, 16706, 15124, 13602, 12137, 10725, 9363, 8049, 6780, 5554, 4370, 3224, 2115, 1041, - 0, 63520, 61565, 59668, 57826, 56039, 54302, 52614, 50973, 49377, 47824, 46313, 44841, 43407, 42011, 40649, - 39322, 38028, 36765, 35532, 34329, 33154, 32006, 30885, 29790, 28719, 27671, 26647, 25645, 24665, 23705, 22766, - 21846, 20945, 20063, 19198, 18351, 17520, 16706, 15907, 15124, 14356, 13602, 12863, 12137, 11424, 10725, 10038, - 9363, 8700, 8049, 7409, 6780, 6162, 5554, 4957, 4370, 3792, 3224, 2665, 2115, 1573, 1041, 517, - 0, 64520, 63520, 62535, 61565, 60609, 59668, 58740, 57826, 56926, 56039, 55164, 54302, 53452, 52614, 51788, - 50973, 50169, 49377, 48595, 47824, 47063, 46313, 45572, 44841, 44120, 43407, 42705, 42011, 41326, 40649, 39982, - 39322, 38671, 38028, 37392, 36765, 36145, 35532, 34927, 34329, 33738, 33154, 32577, 32006, 31443, 30885, 30334, - 29790, 29251, 28719, 28192, 27671, 27156, 26647, 26143, 25645, 25152, 24665, 24182, 23705, 23233, 22766, 22303, - 21846, 21393, 20945, 20502, 20063, 19628, 19198, 18772, 18351, 17933, 17520, 17111, 16706, 16305, 15907, 15514, - 15124, 14738, 14356, 13977, 13602, 13231, 12863, 12498, 12137, 11779, 11424, 11073, 10725, 10380, 10038, 9699, - 9363, 9030, 8700, 8373, 8049, 7727, 7409, 7093, 6780, 6470, 6162, 5857, 5554, 5254, 4957, 4662, - 4370, 4080, 3792, 3507, 3224, 2943, 2665, 2388, 2115, 1843, 1573, 1306, 1041, 778, 517, 258, - }; - // This table can be avoided using a single addition and counting trailing zeros. - static sz_u8_t const shifts[256] = { - 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // - 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // - 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // - 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - }; - sz_u32_t multiplier = multipliers[divisor]; - sz_u8_t shift = shifts[divisor]; - - sz_u16_t q = (sz_u16_t)((multiplier * number) >> 16); - sz_u16_t t = ((number - q) >> 1) + q; - return (sz_u8_t)(t >> shift); -} - SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result) { sz_u8_t const *unsigned_lut = (sz_u8_t const *)lut; sz_u8_t const *unsigned_text = (sz_u8_t const *)text; @@ -3216,280 +254,24 @@ SZ_PUBLIC sz_bool_t sz_isascii_serial(sz_cptr_t text, sz_size_t length) { #if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. 
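/* As a standalone usage sketch of the look-up-table transform documented in the Helper API above:
 * build a 256-byte table once, remap only 'A'..'Z', and apply it byte by byte. The helper name and
 * the on-stack table are illustrative assumptions, not part of this header. */
#include <stddef.h>

static void ascii_tolower_with_lut(char const *text, size_t length, char *result) {
    unsigned char lut[256];
    for (int i = 0; i != 256; ++i) lut[i] = (unsigned char)i;
    for (int i = 'A'; i <= 'Z'; ++i) lut[i] = (unsigned char)(i + 32); /* Flip the 5th bit: 'A' (65) -> 'a' (97). */
    for (size_t i = 0; i != length; ++i) result[i] = (char)lut[(unsigned char)text[i]];
}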
for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h & 0x80ull) return sz_false_k; -#endif - - // Validate eight bytes at once using SWAR. - sz_u64_vec_t text_vec; - for (; h + 8 <= h_end; h += 8) { - text_vec.u64 = *(sz_u64_t const *)h; - if (text_vec.u64 & 0x8080808080808080ull) return sz_false_k; - } - - // Handle the misaligned tail. - for (; h < h_end; ++h) - if (*h & 0x80ull) return sz_false_k; - return sz_true_k; -} - -SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { - - sz_assert(alphabet_size > 0 && alphabet_size <= 256 && "Inadequate alphabet size"); - - if (alphabet_size == 1) sz_fill(result, result_length, *alphabet); - - else { - sz_assert(generator && "Expects a valid random generator"); - sz_u8_t divisor = (sz_u8_t)alphabet_size; - for (sz_cptr_t end = result + result_length; result != end; ++result) { - sz_u8_t random = generator(generator_user_data) & 0xFF; - sz_u8_t quotient = sz_u8_divide(random, divisor); - *result = alphabet[random - quotient * divisor]; - } - } -} - -#pragma endregion - -/* - * Serial implementation of string class operations. - */ -#pragma region Serial Implementation for the String Class - -SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string) { - // It doesn't matter if it's on stack or heap, the pointer location is the same. - return (sz_bool_t)((sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]); -} - -SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length) { - sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]; - sz_size_t is_big_mask = is_small - 1ull; - *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same. - // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. - *length = string->external.length & (0x00000000000000FFull | is_big_mask); -} - -SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, - sz_bool_t *is_external) { - sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]; - sz_size_t is_big_mask = is_small - 1ull; - *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same. - // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. - *length = string->external.length & (0x00000000000000FFull | is_big_mask); - // In case the string is small, the `is_small - 1ull` will become 0xFFFFFFFFFFFFFFFFull. - *space = sz_u64_blend(SZ_STRING_INTERNAL_SPACE, string->external.space, is_big_mask); - *is_external = (sz_bool_t)!is_small; -} - -SZ_PUBLIC sz_bool_t sz_string_equal(sz_string_t const *a, sz_string_t const *b) { - // Tempting to say that the external.length is bitwise the same even if it includes - // some bytes of the on-stack payload, but we don't at this writing maintain that invariant. - // (An on-stack string includes noise bytes in the high-order bits of external.length. So do this - // the hard/correct way. - -#if SZ_USE_MISALIGNED_LOADS - // Dealing with StringZilla strings, we know that the `start` pointer always points - // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. - -#endif - // Alternatively, fall back to byte-by-byte comparison. 
- sz_ptr_t a_start, b_start; - sz_size_t a_length, b_length; - sz_string_range(a, &a_start, &a_length); - sz_string_range(b, &b_start, &b_length); - return (sz_bool_t)(a_length == b_length && sz_equal(a_start, b_start, b_length)); -} - -SZ_PUBLIC sz_ordering_t sz_string_order(sz_string_t const *a, sz_string_t const *b) { -#if SZ_USE_MISALIGNED_LOADS - // Dealing with StringZilla strings, we know that the `start` pointer always points - // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. - -#endif - // Alternatively, fall back to byte-by-byte comparison. - sz_ptr_t a_start, b_start; - sz_size_t a_length, b_length; - sz_string_range(a, &a_start, &a_length); - sz_string_range(b, &b_start, &b_length); - return sz_order(a_start, a_length, b_start, b_length); -} - -SZ_PUBLIC void sz_string_init(sz_string_t *string) { - sz_assert(string && "String can't be SZ_NULL."); - - // Only 8 + 1 + 1 need to be initialized. - string->internal.start = &string->internal.chars[0]; - // But for safety let's initialize the entire structure to zeros. - // string->internal.chars[0] = 0; - // string->internal.length = 0; - string->words[1] = 0; - string->words[2] = 0; - string->words[3] = 0; -} - -SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator) { - sz_size_t space_needed = length + 1; // space for trailing \0 - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); - // Initialize the string to zeros for safety. - string->words[1] = 0; - string->words[2] = 0; - string->words[3] = 0; - // If we are lucky, no memory allocations will be needed. - if (space_needed <= SZ_STRING_INTERNAL_SPACE) { - string->internal.start = &string->internal.chars[0]; - string->internal.length = (sz_u8_t)length; - } - else { - // If we are not lucky, we need to allocate memory. - string->external.start = (sz_ptr_t)allocator->allocate(space_needed, allocator->handle); - if (!string->external.start) return SZ_NULL_CHAR; - string->external.length = length; - string->external.space = space_needed; - } - sz_assert(&string->internal.start == &string->external.start && "Alignment confusion"); - string->external.start[length] = 0; - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - - sz_size_t new_space = new_capacity + 1; - if (new_space <= SZ_STRING_INTERNAL_SPACE) return string->external.start; - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - sz_assert(new_space > string_space && "New space must be larger than current."); - - sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); - if (!new_start) return SZ_NULL_CHAR; - - sz_copy(new_start, string_start, string_length); - string->external.start = new_start; - string->external.space = new_space; - string->external.padding = 0; - string->external.length = string_length; - - // Deallocate the old string. 
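/* A small standalone demonstration of the branch-free length extraction used by `sz_string_range` above:
 * `is_small - 1` becomes an all-zeros or all-ones mask, so one AND picks either the low length byte of
 * the on-stack layout or the full 64-bit length of the heap layout. Values and names are illustrative only. */
#include <stdint.h>
#include <stdio.h>

static void show_branchless_length(void) {
    uint64_t on_stack_word = 0xDEADBEEF00000007ull; /* Only the low byte (7) is the length. */
    uint64_t on_heap_word = 0x0000000000001234ull;  /* The whole word is the length. */
    for (int is_small = 1; is_small >= 0; --is_small) {
        uint64_t is_big_mask = (uint64_t)is_small - 1ull; /* 0 if small, 0xFFFF...FF if big. */
        uint64_t word = is_small ? on_stack_word : on_heap_word;
        uint64_t length = word & (0xFFull | is_big_mask); /* 7 for the small case, 0x1234 for the big one. */
        printf("is_small=%d -> length=0x%llx\n", is_small, (unsigned long long)length);
    }
}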
- if (string_is_external) allocator->free(string_start, string_space, allocator->handle); - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // We may already be space-optimal, and in that case we don't need to do anything. - sz_size_t new_space = string_length + 1; - if (string_space == new_space || !string_is_external) return string->external.start; - - sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); - if (!new_start) return SZ_NULL_CHAR; - - sz_copy(new_start, string_start, string_length); - string->external.start = new_start; - string->external.space = new_space; - string->external.padding = 0; - string->external.length = string_length; - - // Deallocate the old string. - if (string_is_external) allocator->free(string_start, string_space, allocator->handle); - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length, - sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // The user intended to extend the string. - offset = sz_min_of_two(offset, string_length); - - // If we are lucky, no memory allocations will be needed. - if (string_length + added_length < string_space) { - sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); - string_start[string_length + added_length] = 0; - // Even if the string is on the stack, the `+=` won't affect the tail of the string. - string->external.length += added_length; - } - // If we are not lucky, we need to allocate more memory. - else { - sz_size_t next_planned_size = sz_max_of_two(SZ_CACHE_LINE_WIDTH, string_space * 2ull); - sz_size_t min_needed_space = sz_size_bit_ceil(offset + string_length + added_length + 1); - sz_size_t new_space = sz_max_of_two(min_needed_space, next_planned_size); - string_start = sz_string_reserve(string, new_space - 1, allocator); - if (!string_start) return SZ_NULL_CHAR; - - // Copy into the new buffer. - sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); - string_start[string_length + added_length] = 0; - string->external.length = string_length + added_length; - } - - return string_start; -} - -SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length) { - - sz_assert(string && "String can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // Normalize the offset, it can't be larger than the length. - offset = sz_min_of_two(offset, string_length); - - // We shouldn't normalize the length, to avoid overflowing on `offset + length >= string_length`, - // if receiving `length == SZ_SIZE_MAX`. 
After following expression the `length` will contain - // exactly the delta between original and final length of this `string`. - length = sz_min_of_two(length, string_length - offset); - - // There are 2 common cases, that wouldn't even require a `memmove`: - // 1. Erasing the entire contents of the string. - // In that case `length` argument will be equal or greater than `length` member. - // 2. Removing the tail of the string with something like `string.pop_back()` in C++. - // - // In both of those, regardless of the location of the string - stack or heap, - // the erasing is as easy as setting the length to the offset. - // In every other case, we must `memmove` the tail of the string to the left. - if (offset + length < string_length) - sz_move(string_start + offset, string_start + offset + length, string_length - offset - length); + if (*h & 0x80ull) return sz_false_k; +#endif - // The `string->external.length = offset` assignment would discard last characters - // of the on-the-stack string, but inplace subtraction would work. - string->external.length -= length; - string_start[string_length - length] = 0; - return length; -} + // Validate eight bytes at once using SWAR. + sz_u64_vec_t text_vec; + for (; h + 8 <= h_end; h += 8) { + text_vec.u64 = *(sz_u64_t const *)h; + if (text_vec.u64 & 0x8080808080808080ull) return sz_false_k; + } -SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator) { - if (!sz_string_is_on_stack(string)) - allocator->free(string->external.start, string->external.space, allocator->handle); - sz_string_init(string); + // Handle the misaligned tail. + for (; h < h_end; ++h) + if (*h & 0x80ull) return sz_false_k; + return sz_true_k; } -// When overriding libc, disable optimisations for this function beacuse MSVC will optimize the loops into a memset. -// Which then causes a stack overflow due to infinite recursion (memset -> sz_fill_serial -> memset). +// When overriding libc, disable optimizations for this function because MSVC will optimize the loops into a `memset`. +// Which then causes a stack overflow due to infinite recursion (`memset` -> `sz_fill_serial` -> `memset`). #if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC #pragma optimize("", off) #endif @@ -3548,338 +330,17 @@ SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt #pragma endregion -/* - * @brief Serial implementation for strings sequence processing. 
- */ -#pragma region Serial Implementation for Sequences - -SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) { - - sz_size_t matches = 0; - while (matches != sequence->count && predicate(sequence, sequence->order[matches])) ++matches; - - for (sz_size_t i = matches + 1; i < sequence->count; ++i) - if (predicate(sequence, sequence->order[i])) - sz_u64_swap(sequence->order + i, sequence->order + matches), ++matches; - - return matches; -} - -SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) { - - sz_size_t start_b = partition + 1; - - // If the direct merge is already sorted - if (!less(sequence, sequence->order[start_b], sequence->order[partition])) return; - - sz_size_t start_a = 0; - while (start_a <= partition && start_b <= sequence->count) { - - // If element 1 is in right place - if (!less(sequence, sequence->order[start_b], sequence->order[start_a])) { start_a++; } - else { - sz_size_t value = sequence->order[start_b]; - sz_size_t index = start_b; - - // Shift all the elements between element 1 - // element 2, right by 1. - while (index != start_a) { sequence->order[index] = sequence->order[index - 1], index--; } - sequence->order[start_a] = value; - - // Update all the pointers - start_a++; - partition++; - start_b++; - } - } -} - -SZ_PUBLIC void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - sz_u64_t *keys = sequence->order; - sz_size_t keys_count = sequence->count; - for (sz_size_t i = 1; i < keys_count; i++) { - sz_u64_t i_key = keys[i]; - sz_size_t j = i; - for (; j > 0 && less(sequence, i_key, keys[j - 1]); --j) keys[j] = keys[j - 1]; - keys[j] = i_key; - } -} - -SZ_INTERNAL void _sz_sift_down(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start, - sz_size_t end) { - sz_size_t root = start; - while (2 * root + 1 <= end) { - sz_size_t child = 2 * root + 1; - if (child + 1 <= end && less(sequence, order[child], order[child + 1])) { child++; } - if (!less(sequence, order[root], order[child])) { return; } - sz_u64_swap(order + root, order + child); - root = child; - } -} - -SZ_INTERNAL void _sz_heapify(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t count) { - sz_size_t start = (count - 2) / 2; - while (1) { - _sz_sift_down(sequence, less, order, start, count - 1); - if (start == 0) return; - start--; - } -} - -SZ_INTERNAL void _sz_heapsort(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last) { - sz_u64_t *order = sequence->order; - sz_size_t count = last - first; - _sz_heapify(sequence, less, order + first, count); - sz_size_t end = count - 1; - while (end > 0) { - sz_u64_swap(order + first, order + first + end); - end--; - _sz_sift_down(sequence, less, order + first, 0, end); - } -} - -SZ_PUBLIC void sz_sort_introsort_recursion(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, - sz_size_t last, sz_size_t depth) { - - sz_size_t length = last - first; - switch (length) { - case 0: - case 1: return; - case 2: - if (less(sequence, sequence->order[first + 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[first + 1]); - return; - case 3: { - sz_u64_t a = sequence->order[first]; - sz_u64_t b = sequence->order[first + 1]; - sz_u64_t c = sequence->order[first + 2]; - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - if (less(sequence, c, b)) sz_u64_swap(&c, &b); - if (less(sequence, b, a)) sz_u64_swap(&a, 
&b); - sequence->order[first] = a; - sequence->order[first + 1] = b; - sequence->order[first + 2] = c; - return; - } - } - // Until a certain length, the quadratic-complexity insertion-sort is fine - if (length <= 16) { - sz_sequence_t sub_seq = *sequence; - sub_seq.order += first; - sub_seq.count = length; - sz_sort_insertion(&sub_seq, less); - return; - } - - // Fallback to N-logN-complexity heap-sort - if (depth == 0) { - _sz_heapsort(sequence, less, first, last); - return; - } - - --depth; - - // Median-of-three logic to choose pivot - sz_size_t median = first + length / 2; - if (less(sequence, sequence->order[median], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[median]); - if (less(sequence, sequence->order[last - 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[last - 1]); - if (less(sequence, sequence->order[median], sequence->order[last - 1])) - sz_u64_swap(&sequence->order[median], &sequence->order[last - 1]); - - // Partition using the median-of-three as the pivot - sz_u64_t pivot = sequence->order[median]; - sz_size_t left = first; - sz_size_t right = last - 1; - while (1) { - while (less(sequence, sequence->order[left], pivot)) left++; - while (less(sequence, pivot, sequence->order[right])) right--; - if (left >= right) break; - sz_u64_swap(&sequence->order[left], &sequence->order[right]); - left++; - right--; - } - - // Recursively sort the partitions - sz_sort_introsort_recursion(sequence, less, first, left, depth); - sz_sort_introsort_recursion(sequence, less, right + 1, last, depth); -} - -SZ_PUBLIC void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - if (sequence->count == 0) return; - sz_size_t size_is_not_power_of_two = (sequence->count & (sequence->count - 1)) != 0; - sz_size_t depth_limit = sz_size_log2i_nonzero(sequence->count) + size_is_not_power_of_two; - sz_sort_introsort_recursion(sequence, less, 0, sequence->count, depth_limit); -} - -SZ_PUBLIC void sz_sort_recursion( // - sz_sequence_t *sequence, sz_size_t bit_idx, sz_size_t bit_max, sz_sequence_comparator_t comparator, - sz_size_t partial_order_length) { - - if (!sequence->count) return; - - // Array of size one doesn't need sorting - only needs the prefix to be discarded. - if (sequence->count == 1) { - sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; - order_half_words[1] = 0; - return; - } - - // Partition a range of integers according to a specific bit value - sz_size_t split = 0; - sz_u64_t mask = (1ull << 63) >> bit_idx; - - // The clean approach would be to perform a single pass over the sequence. - // - // while (split != sequence->count && !(sequence->order[split] & mask)) ++split; - // for (sz_size_t i = split + 1; i < sequence->count; ++i) - // if (!(sequence->order[i] & mask)) sz_u64_swap(sequence->order + i, sequence->order + split), ++split; - // - // This, however, doesn't take into account the high relative cost of writes and swaps. - // To circumvent that, we can first count the total number entries to be mapped into either part. - // And then walk through both parts, swapping the entries that are in the wrong part. - // This would often lead to ~15% performance gain. - sz_size_t count_with_bit_set = 0; - for (sz_size_t i = 0; i != sequence->count; ++i) count_with_bit_set += (sequence->order[i] & mask) != 0; - split = sequence->count - count_with_bit_set; - - // It's possible that the sequence is already partitioned. 
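/* For clarity, the same count-first partitioning on a plain array of 64-bit keys, as a standalone sketch:
 * count how many keys have the bit set, derive the split point, then walk from both ends and swap only
 * the mispositioned entries. The helper name is an illustrative assumption, not part of this header. */
#include <stddef.h>
#include <stdint.h>

static size_t partition_by_bit(uint64_t *keys, size_t count, uint64_t mask) {
    size_t with_bit_set = 0;
    for (size_t i = 0; i != count; ++i) with_bit_set += (keys[i] & mask) != 0;
    size_t split = count - with_bit_set;
    if (split == 0 || split == count) return split; /* Already partitioned, nothing to move. */
    size_t left = 0, right = count - 1;
    while (1) {
        while (left < split && !(keys[left] & mask)) ++left;    /* Already in the cleared-bit half. */
        while (right >= split && (keys[right] & mask)) --right; /* Already in the set-bit half. */
        if (left < split && right >= split) {
            uint64_t temporary = keys[left];
            keys[left] = keys[right], keys[right] = temporary;
            ++left, --right;
        }
        else { break; }
    }
    return split;
}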
- if (split != 0 && split != sequence->count) { - // Use two pointers to efficiently reposition elements. - // On pointer walks left-to-right from the start, and the other walks right-to-left from the end. - sz_size_t left = 0; - sz_size_t right = sequence->count - 1; - while (1) { - // Find the next element with the bit set on the left side. - while (left < split && !(sequence->order[left] & mask)) ++left; - // Find the next element without the bit set on the right side. - while (right >= split && (sequence->order[right] & mask)) --right; - // Swap the mispositioned elements. - if (left < split && right >= split) { - sz_u64_swap(sequence->order + left, sequence->order + right); - ++left; - --right; - } - else { break; } - } - } - - // Go down recursively. - if (bit_idx < bit_max) { - sz_sequence_t a = *sequence; - a.count = split; - sz_sort_recursion(&a, bit_idx + 1, bit_max, comparator, partial_order_length); - - sz_sequence_t b = *sequence; - b.order += split; - b.count -= split; - sz_sort_recursion(&b, bit_idx + 1, bit_max, comparator, partial_order_length); - } - // Reached the end of recursion. - else { - // Discard the prefixes. - sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; - for (sz_size_t i = 0; i != sequence->count; ++i) { order_half_words[i * 2 + 1] = 0; } - - sz_sequence_t a = *sequence; - a.count = split; - sz_sort_introsort(&a, comparator); - - sz_sequence_t b = *sequence; - b.order += split; - b.count -= split; - sz_sort_introsort(&b, comparator); - } -} - -SZ_INTERNAL sz_bool_t _sz_sort_is_less(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { - sz_cptr_t i_str = sequence->get_start(sequence, i_key); - sz_cptr_t j_str = sequence->get_start(sequence, j_key); - sz_size_t i_len = sequence->get_length(sequence, i_key); - sz_size_t j_len = sequence->get_length(sequence, j_key); - return (sz_bool_t)(sz_order_serial(i_str, i_len, j_str, j_len) == sz_less_k); -} - -SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t partial_order_length) { - -#if SZ_DETECT_BIG_ENDIAN - // TODO: Implement partial sort for big-endian systems. For now this sorts the whole thing. - sz_unused(partial_order_length); - sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); -#else - - // Export up to 4 bytes into the `sequence` bits themselves - for (sz_size_t i = 0; i != sequence->count; ++i) { - sz_cptr_t begin = sequence->get_start(sequence, sequence->order[i]); - sz_size_t length = sequence->get_length(sequence, sequence->order[i]); - length = length > 4u ? 4u : length; - sz_ptr_t prefix = (sz_ptr_t)&sequence->order[i]; - for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; - } - - // Perform optionally-parallel radix sort on them - sz_sort_recursion(sequence, 0, 32, (sz_sequence_comparator_t)_sz_sort_is_less, partial_order_length); -#endif -} - -SZ_PUBLIC void sz_sort(sz_sequence_t *sequence) { -#if SZ_DETECT_BIG_ENDIAN - sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); -#else - sz_sort_partial(sequence, sequence->count); -#endif -} - -#pragma endregion - -/* - * @brief AVX2 implementation of the string search algorithms. - * Very minimalistic, but still faster than the serial implementation. +/* AVX2 implementation of the string search algorithms for Haswell processors and newer. + * Very minimalistic (compared to AVX-512), but still faster than the serial implementation. 
*/ -#pragma region AVX2 Implementation +#pragma region Haswell Implementation -#if SZ_USE_X86_AVX2 +#if SZ_USE_HASWELL #pragma GCC push_options -#pragma GCC target("avx2") -#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) -#include - -/** - * @brief Helper structure to simplify work with 256-bit registers. - */ -typedef union sz_u256_vec_t { - __m256i ymm; - __m128i xmms[2]; - sz_u64_t u64s[4]; - sz_u32_t u32s[8]; - sz_u16_t u16s[16]; - sz_u8_t u8s[32]; -} sz_u256_vec_t; - -SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: - //! https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations - return sz_order_serial(a, a_length, b, b_length); -} +#pragma GCC target("haswell") +#pragma clang attribute push(__attribute__((target("haswell"))), apply_to = function) -SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_u256_vec_t a_vec, b_vec; - - while (length >= 32) { - a_vec.ymm = _mm256_lddqu_si256((__m256i const *)a); - b_vec.ymm = _mm256_lddqu_si256((__m256i const *)b); - // One approach can be to use "movemasks", but we could also use a bitwise matching like `_mm256_testnzc_si256`. - int difference_mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi8(a_vec.ymm, b_vec.ymm)); - if (difference_mask == 0) { a += 32, b += 32, length -= 32; } - else { return sz_false_k; } - } - - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; -} - -SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value) { +SZ_PUBLIC void sz_fill_haswell(sz_ptr_t target, sz_size_t length, sz_u8_t value) { char value_char = *(char *)&value; __m256i value_vec = _mm256_set1_epi8(value_char); // The naive implementation of this function is very simple. @@ -3935,7 +396,7 @@ SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) // For now, let's avoid the cases beyond the L2 size. int is_huge = length > 1ull * 1024ull * 1024ull; if (length <= 32) { sz_copy_serial(target, source, length); } - // When dealing wirh larger arrays, the optimization is not as simple as with the `sz_fill_avx2` function, + // When dealing wirh larger arrays, the optimization is not as simple as with the `sz_fill_haswell` function, // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer, // we can use aligned loads and stores, and the performance will be great. else if ((sz_size_t)target % 32 == 0 && (sz_size_t)source % 32 == 0 && !is_huge) { @@ -4002,86 +463,6 @@ SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) } } -SZ_PUBLIC sz_u64_t sz_checksum_avx2(sz_cptr_t text, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "loads". - // - // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core, - // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. - // For now, let's avoid the cases beyond the L2 size. - int is_huge = length > 1ull * 1024ull * 1024ull; - - // When the buffer is small, there isn't much to innovate. 
- if (length <= 32) { return sz_checksum_serial(text, length); } - else if (!is_huge) { - sz_u256_vec_t text_vec, sums_vec; - sums_vec.ymm = _mm256_setzero_si256(); - for (; length >= 32; text += 32, length -= 32) { - text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - sz_u64_t result = low + high; - if (length) result += sz_checksum_serial(text, length); - return result; - } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // Most notably, we can avoid populating the cache with the entire buffer, and instead traverse it in 2 directions. - else { - sz_size_t head_length = (32 - ((sz_size_t)text % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(text + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - sz_u64_t result = 0; - - // Handle the head - while (head_length--) result += *text++; - - sz_u256_vec_t text_vec, sums_vec; - sums_vec.ymm = _mm256_setzero_si256(); - // Fill the aligned body of the buffer. - if (!is_huge) { - for (; body_length >= 32; text += 32, body_length -= 32) { - text_vec.ymm = _mm256_stream_load_si256((__m256i const *)text); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - } - // When the biffer is huge, we can traverse it in 2 directions. - else { - sz_u256_vec_t text_reversed_vec, sums_reversed_vec; - sums_reversed_vec.ymm = _mm256_setzero_si256(); - for (; body_length >= 64; text += 64, body_length -= 64) { - text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - text_reversed_vec.ymm = _mm256_stream_load_si256((__m256i *)(text + body_length - 64)); - sums_reversed_vec.ymm = _mm256_add_epi64( - sums_reversed_vec.ymm, _mm256_sad_epu8(text_reversed_vec.ymm, _mm256_setzero_si256())); - } - if (body_length >= 32) { - text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, sums_reversed_vec.ymm); - } - - // Handle the tail - while (tail_length--) result += *text++; - - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. 
- __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - result += low + high; - return result; - } -} - SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { // If the input is tiny (especially smaller than the look-up table itself), we may end up paying @@ -4218,503 +599,24 @@ SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_ if (length) sz_look_up_transform_serial(source, length, lut, target); } -SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - int mask; - sz_u256_vec_t h_vec, n_vec; - n_vec.ymm = _mm256_set1_epi8(n[0]); - - while (h_length >= 32) { - h_vec.ymm = _mm256_lddqu_si256((__m256i const *)h); - mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm)); - if (mask) return h + sz_u32_ctz(mask); - h += 32, h_length -= 32; - } - - return sz_find_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - int mask; - sz_u256_vec_t h_vec, n_vec; - n_vec.ymm = _mm256_set1_epi8(n[0]); - - while (h_length >= 32) { - h_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + h_length - 32)); - mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm)); - if (mask) return h + h_length - 1 - sz_u32_clz(mask); - h_length -= 32; - } - - return sz_rfind_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_avx2(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into YMM registers. - int matches; - sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]); - n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]); - n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]); - - // Scan through the string. - for (; h_length >= n_length + 32; h += 32, h_length -= 32) { - h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_first)); - h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_mid)); - h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_last)); - matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); - while (matches) { - int potential_offset = sz_u32_ctz(matches); - if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - - return sz_find_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. 
- if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx2(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into YMM registers. - int matches; - sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]); - n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]); - n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]); - - // Scan through the string. - sz_cptr_t h_reversed; - for (; h_length >= n_length + 32; h_length -= 32) { - h_reversed = h + h_length - n_length - 32 + 1; - h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_first)); - h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_mid)); - h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_last)); - matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); - while (matches) { - int potential_offset = sz_u32_clz(matches); - if (sz_equal(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - matches &= ~(1 << (31 - potential_offset)); - } - } - - return sz_rfind_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_avx2(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - - // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. - // That way when we invoke `_mm256_shuffle_epi8` we can use the same mask for both lanes. - sz_u256_vec_t filter_even_vec, filter_odd_vec; - for (sz_size_t i = 0; i != 16; ++i) - filter_even_vec.u8s[i] = filter->_u8s[i * 2], filter_odd_vec.u8s[i] = filter->_u8s[i * 2 + 1]; - filter_even_vec.xmms[1] = filter_even_vec.xmms[0]; - filter_odd_vec.xmms[1] = filter_odd_vec.xmms[0]; - - sz_u256_vec_t text_vec; - sz_u256_vec_t matches_vec; - sz_u256_vec_t lower_nibbles_vec, higher_nibbles_vec; - sz_u256_vec_t bitset_even_vec, bitset_odd_vec; - sz_u256_vec_t bitmask_vec, bitmask_lookup_vec; - bitmask_lookup_vec.ymm = _mm256_set_epi8(-128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); - - while (length >= 32) { - // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set" - // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so - // StrinZilla uses a somewhat different approach. - // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new - // - // sz_u8_t input = *(sz_u8_t const *)text; - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = filter_even_vec.u8s[hi_nibble]; - // sz_u8_t bitset_odd = filter_odd_vec.u8s[hi_nibble]; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_u8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd; - // if ((bitset & bitmask) != 0) return text; - // else { length--, text++; } - // - // The nice part about this, loading the strided data is vey easy with Arm NEON, - // while with x86 CPUs after AVX, shuffles within 256 bits shouldn't be an issue either. 
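-        // The vectorized code below repeats that scalar check for 32 input bytes at once: the low nibble of every
-        // byte picks a bit via `_mm256_shuffle_epi8`, the high nibble picks the matching bitset byte, and a final
-        // `and` + `cmpeq` + `movemask` reports which of the 32 bytes belong to the set.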
- text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); - lower_nibbles_vec.ymm = _mm256_and_si256(text_vec.ymm, _mm256_set1_epi8(0x0f)); - bitmask_vec.ymm = _mm256_shuffle_epi8(bitmask_lookup_vec.ymm, lower_nibbles_vec.ymm); - // - // At this point we can validate the `bitmask_vec` contents like this: - // - // for (sz_size_t i = 0; i != 32; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); - // } - // - // Shift right every byte by 4 bits. - // There is no `_mm256_srli_epi8` intrinsic, so we have to use `_mm256_srli_epi16` - // and combine it with a mask to clear the higher bits. - higher_nibbles_vec.ymm = _mm256_and_si256(_mm256_srli_epi16(text_vec.ymm, 4), _mm256_set1_epi8(0x0f)); - bitset_even_vec.ymm = _mm256_shuffle_epi8(filter_even_vec.ymm, higher_nibbles_vec.ymm); - bitset_odd_vec.ymm = _mm256_shuffle_epi8(filter_odd_vec.ymm, higher_nibbles_vec.ymm); - // - // At this point we can validate the `bitset_even_vec` and `bitset_odd_vec` contents like this: - // - // for (sz_size_t i = 0; i != 32; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t const *bitset_ptr = &filter->_u8s[0]; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; - // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); - // } - // - __m256i take_first = _mm256_cmpgt_epi8(_mm256_set1_epi8(8), lower_nibbles_vec.ymm); - bitset_even_vec.ymm = _mm256_blendv_epi8(bitset_odd_vec.ymm, bitset_even_vec.ymm, take_first); - - // It would have been great to have an instruction that tests the bits and then broadcasts - // the matching bit into all bits in that byte. But we don't have that, so we have to - // `and`, `cmpeq`, `movemask`, and then invert at the end... - matches_vec.ymm = _mm256_and_si256(bitset_even_vec.ymm, bitmask_vec.ymm); - matches_vec.ymm = _mm256_cmpeq_epi8(matches_vec.ymm, _mm256_setzero_si256()); - int matches_mask = ~_mm256_movemask_epi8(matches_vec.ymm); - if (matches_mask) { - int offset = sz_u32_ctz(matches_mask); - return text + offset; - } - else { text += 32, length -= 32; } - } - - return sz_find_charset_serial(text, length, filter); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx2(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); -} - -/** - * @brief There is no AVX2 instruction for fast multiplication of 64-bit integers. - * This implementation is coming from Agner Fog's Vector Class Library. 
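- *          It assembles each 64-bit product from 32-bit halves: `_mm256_mul_epu32` provides the low-by-low
- *          terms, while the two cross terms come from `_mm256_mullo_epi32` and are added into the upper 32 bits.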
- */ -SZ_INTERNAL __m256i _mm256_mul_epu64(__m256i a, __m256i b) { - __m256i bswap = _mm256_shuffle_epi32(b, 0xB1); - __m256i prodlh = _mm256_mullo_epi32(a, bswap); - __m256i zero = _mm256_setzero_si256(); - __m256i prodlh2 = _mm256_hadd_epi32(prodlh, zero); - __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2, 0x73); - __m256i prodll = _mm256_mul_epu32(a, b); - __m256i prod = _mm256_add_epi64(prodll, prodlh3); - return prod; -} - -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - if (length < 4 * window_length) { - sz_hashes_serial(start, length, window_length, step, callback, callback_handle); - return; - } - - // Using AVX2, we can perform 4 long integer multiplications and additions within one register. - // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel. - sz_size_t const max_hashes = length - window_length + 1; - sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads. - sz_u8_t const *text_first = (sz_u8_t const *)start; - sz_u8_t const *text_second = text_first + min_hashes_per_thread; - sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2; - sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3; - sz_u8_t const *text_end = text_first + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Broadcast the constants into the registers. - sz_u256_vec_t prime_vec, golden_ratio_vec; - sz_u256_vec_t base_low_vec, base_high_vec, prime_power_low_vec, prime_power_high_vec, shift_high_vec; - base_low_vec.ymm = _mm256_set1_epi64x(31ull); - base_high_vec.ymm = _mm256_set1_epi64x(257ull); - shift_high_vec.ymm = _mm256_set1_epi64x(77ull); - prime_vec.ymm = _mm256_set1_epi64x(SZ_U64_MAX_PRIME); - golden_ratio_vec.ymm = _mm256_set1_epi64x(11400714819323198485ull); - prime_power_low_vec.ymm = _mm256_set1_epi64x(prime_power_low); - prime_power_high_vec.ymm = _mm256_set1_epi64x(prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u256_vec_t hash_low_vec, hash_high_vec, hash_mix_vec, chars_low_vec, chars_high_vec; - hash_low_vec.ymm = _mm256_setzero_si256(); - hash_high_vec.ymm = _mm256_setzero_si256(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. 
- hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - } - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t const step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - hash_low_vec.ymm = - _mm256_sub_epi64(hash_low_vec.ymm, _mm256_mul_epu64(chars_low_vec.ymm, prime_power_low_vec.ymm)); - hash_high_vec.ymm = - _mm256_sub_epi64(hash_high_vec.ymm, _mm256_mul_epu64(chars_high_vec.ymm, prime_power_high_vec.ymm)); - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. - hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - - // 5. 
Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - if ((cycle & step_mask) == 0) { - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - } - } -} - #pragma clang attribute pop #pragma GCC pop_options -#endif -#pragma endregion - -/* - * @brief AVX-512 implementation of the string search algorithms. - * - * Different subsets of AVX-512 were introduced in different years: - * - 2017 SkyLake: F, CD, ER, PF, VL, DQ, BW - * - 2018 CannonLake: IFMA, VBMI - * - 2019 IceLake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES - * - 2020 TigerLake: VP2INTERSECT - */ -#pragma region AVX512 Implementation - -#if SZ_USE_X86_AVX512 -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) -#include +#endif // SZ_USE_HASWELL +#pragma endregion // Haswell Implementation -/** - * @brief Helper structure to simplify work with 512-bit registers. - */ -typedef union sz_u512_vec_t { - __m512i zmm; - __m256i ymms[2]; - __m128i xmms[4]; - sz_u64_t u64s[8]; - sz_u32_t u32s[16]; - sz_u16_t u16s[32]; - sz_u8_t u8s[64]; - sz_i64_t i64s[8]; - sz_i32_t i32s[16]; -} sz_u512_vec_t; - -SZ_INTERNAL __mmask64 _sz_u64_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 64: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 64: - return _bzhi_u64(0xFFFFFFFFFFFFFFFF, n < 64 ? (sz_u32_t)n : 64); -} - -SZ_INTERNAL __mmask32 _sz_u32_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 32: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 32: - return _bzhi_u32(0xFFFFFFFF, n < 32 ? (sz_u32_t)n : 32); -} - -SZ_INTERNAL __mmask16 _sz_u16_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 16: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 16: - return _bzhi_u32(0xFFFFFFFF, n < 16 ? 
(sz_u32_t)n : 16); -} - -SZ_INTERNAL __mmask16 _sz_u16_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 16: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 16: - return (__mmask16)_bzhi_u32(0xFFFFFFFF, (sz_u32_t)n); -} - -SZ_INTERNAL __mmask32 _sz_u32_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 32: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 32: - return _bzhi_u32(0xFFFFFFFF, (sz_u32_t)n); -} - -SZ_INTERNAL __mmask64 _sz_u64_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 64: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 64: - return _bzhi_u64(0xFFFFFFFFFFFFFFFF, (sz_u32_t)n); -} - -SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - sz_u512_vec_t a_vec, b_vec; - - // Pointer arithmetic is cheap, fetching memory is not! - // So we can use the masked loads to fetch at most one cache-line for each string, - // compare the prefixes, and only then move forward. - sz_size_t a_head_length = 64 - ((sz_size_t)a % 64); // 63 or less. - sz_size_t b_head_length = 64 - ((sz_size_t)b % 64); // 63 or less. - a_head_length = a_head_length < a_length ? a_head_length : a_length; - b_head_length = b_head_length < b_length ? b_head_length : b_length; - sz_size_t head_length = a_head_length < b_head_length ? a_head_length : b_head_length; - __mmask64 head_mask = _sz_u64_mask_until(head_length); - a_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, b); - __mmask64 mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - else if (head_length == a_length && head_length == b_length) { return sz_equal_k; } - else { a += head_length, b += head_length, a_length -= head_length, b_length -= head_length; } - - // The rare case, when both string are very long. - __mmask64 a_mask, b_mask; - while ((a_length >= 64) & (b_length >= 64)) { - a_vec.zmm = _mm512_loadu_si512(a); - b_vec.zmm = _mm512_loadu_si512(b); - mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - a += 64, b += 64, a_length -= 64, b_length -= 64; - } - - // In most common scenarios at least one of the strings is under 64 bytes. - if (a_length | b_length) { - a_mask = _sz_u64_clamp_mask_until(a_length); - b_mask = _sz_u64_clamp_mask_until(b_length); - a_vec.zmm = _mm512_maskz_loadu_epi8(a_mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(b_mask, b); - // The AVX-512 `_mm512_mask_cmpneq_epi8_mask` intrinsics are generally handy in such environments. - // They, however, have latency 3 on most modern CPUs. Using AVX2: `_mm256_cmpeq_epi8` would have - // been cheaper, if we didn't have to apply `_mm256_movemask_epi8` afterwards. 
- mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - // From logic perspective, the hardest cases are "abc\0" and "abc". - // The result must be `sz_greater_k`, as the latter is shorter. - else { return _sz_order_scalars(a_length, b_length); } - } - - return sz_equal_k; -} - -SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - __mmask64 mask; - sz_u512_vec_t a_vec, b_vec; - - while (length >= 64) { - a_vec.zmm = _mm512_loadu_si512(a); - b_vec.zmm = _mm512_loadu_si512(b); - mask = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask != 0) return sz_false_k; - a += 64, b += 64, length -= 64; - } - - if (length) { - mask = _sz_u64_mask_until(length); - a_vec.zmm = _mm512_maskz_loadu_epi8(mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(mask, b); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpneq_epi8_mask(mask, a_vec.zmm, b_vec.zmm); - return (sz_bool_t)(mask == 0); - } +/* AVX512 implementation of the string search algorithms for Skylake and newer CPUs. + * Includes extensions: F, CD, ER, PF, VL, DQ, BW. + * + * This is the "starting level" for the advanced algorithms using K-mask registers on x86. + */ +#pragma region Skylake Implementation - return sz_true_k; -} +#if SZ_USE_SKYLAKE +#pragma GCC push_options +#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") +#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) -SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value) { +SZ_PUBLIC void sz_fill_skylake(sz_ptr_t target, sz_size_t length, sz_u8_t value) { __m512i value_vec = _mm512_set1_epi8(value); // The naive implementation of this function is very simple. // It assumes the CPU is great at handling unaligned "stores". @@ -4763,7 +665,7 @@ SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt __mmask64 mask = _sz_u64_mask_until(length); _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); } - // When dealing wirh larger arrays, the optimization is not as simple as with the `sz_fill_avx512` function, + // When dealing wirh larger arrays, the optimization is not as simple as with the `sz_fill_skylake` function, // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer, // we can use aligned loads and stores, and the performance will be great. else if ((sz_size_t)target % 64 == 0 && (sz_size_t)source % 64 == 0 && !is_huge) { @@ -4886,931 +788,66 @@ SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt // - if we are shifting data right, that we are traversing to the left. int const left_to_right_traversal = source > target; - // Now we guarantee, that the relative shift within registers is from 1 to 63 bytes and the output is aligned. - // Hopefully, we need to shift more than two ZMM registers, so we could consider `valignr` instruction. - // Sadly, using `_mm512_alignr_epi8` doesn't make sense, as it operates at a 128-bit granularity. - // - // - `_mm256_alignr_epi8` shifts entire 256-bit register, but we need many of them. - // - `_mm512_alignr_epi32` shifts 512-bit chunks, but only if the `shift` is a multiple of 4 bytes. 
- // - `_mm512_alignr_epi64` shifts 512-bit chunks by 8 bytes. - // - // All of those have a latency of 1 cycle, and the shift amount must be an immediate value! - // For 1-byte-shift granularity, the `_mm512_permutex2var_epi8` has a latency of 6 and needs VBMI! - // The most efficient and broadly compatible alternative could be to use a combination of align and shuffle. - // A similar approach was outlined in "Byte-wise alignr in AVX512F" by Wojciech Muła. - // http://0x80.pl/notesen/2016-10-16-avx512-byte-alignr.html - // - // That solution, is extremely mouthful, assuming we need compile time constants for the shift amount. - // A cleaner one, with a latency of 3 cycles, is to use `_mm512_permutexvar_epi8` or - // `_mm512_mask_permutexvar_epi8`, which can be seen as combination of a cross-register shuffle and blend, - // and is available with VBMI. That solution is still noticeably slower than AVX2. - // - // The GLibC implementation also uses non-temporal stores for larger buffers, we don't. - // https://codebrowser.dev/glibc/glibc/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S.html - if (left_to_right_traversal) { - // Head, body, and tail. - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - for (target += head_length, source += head_length; body_length >= 64; - target += 64, source += 64, body_length -= 64) - _mm512_store_si512(target, _mm512_loadu_si512(source)); - _mm512_mask_storeu_epi8(target, tail_mask, _mm512_maskz_loadu_epi8(tail_mask, source)); - } - else { - // Tail, body, and head. - _mm512_mask_storeu_epi8(target + head_length + body_length, tail_mask, - _mm512_maskz_loadu_epi8(tail_mask, source + head_length + body_length)); - for (; body_length >= 64; body_length -= 64) - _mm512_store_si512(target + head_length + body_length - 64, - _mm512_loadu_si512(source + head_length + body_length - 64)); - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - } - } -} - -SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); - - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + sz_u64_ctz(mask); - h += 64, h_length -= 64; - } - - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + sz_u64_ctz(mask); - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_avx512(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. - __mmask64 matches; - __mmask64 mask; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. 
- // We have several optimized versions of the lagorithm for shorter strings, - // but they all mimic the default case for unbounded length needles - if (n_length >= 64) { - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - if (sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - - // TODO: If the last character contains a bad byte, we can reposition the start of the next iteration. - // This will be very helpful for very long needles. - } - } - // If there are only 2 or 3 characters in the needle, we don't even need the nested loop. - else if (n_length <= 3) { - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - if (matches) return h + sz_u64_ctz(matches); - } - } - // If the needle is smaller than the size of the ZMM register, we can use masked comparisons - // to avoid the the inner-most nested loop and compare the entire needle against a haystack - // slice in 3 CPU cycles. - else { - __mmask64 n_mask = _sz_u64_mask_until(n_length); - sz_u512_vec_t n_full_vec, h_full_vec; - n_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, n); - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - h_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, h + potential_offset); - if (_mm512_mask_cmpneq_epi8_mask(n_mask, h_full_vec.zmm, n_full_vec.zmm) == 0) - return h + potential_offset; - matches &= matches - 1; - } - } - } - - // The "tail" of the function uses masked loads to process the remaining bytes. 
- { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - if (n_length <= 3 || sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); - - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h + h_length - 64); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + h_length - 1 - sz_u64_clz(mask); - h_length -= 64; - } - - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + 64 - sz_u64_clz(mask) - 1; - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx512(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. - __mmask64 mask; - __mmask64 matches; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. - sz_cptr_t h_reversed; - for (; h_length >= n_length + 64; h_length -= 64) { - h_reversed = h + h_length - n_length - 64 + 1; - h_first_vec.zmm = _mm512_loadu_si512(h_reversed + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h_reversed + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h_reversed + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_avx512(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } - - // The "tail" of the function uses masked loads to process the remaining bytes. 
- { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_avx512(h + 64 - potential_offset - 1, n, n_length)) - return h + 64 - potential_offset - 1; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } - - return SZ_NULL_CHAR; -} - -#pragma clang attribute pop -#pragma GCC pop_options - -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ - apply_to = function) - -/** - * @brief Computes the edit distance between two very short byte-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 63, and evaluates at most (63 * 2 + 1 = 127) diagonals, or just as many loop cycles. - * Supports an early exit, if the distance is bounded. - * Keeps all of the data and Levenshtein matrices skew diagonal in just a couple of registers. - * Benefits from the @b `vpermb` instructions, that can rotate the bytes across the entire ZMM register. - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - - sz_size_t const max_length = 63u; - sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); - sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); - - // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register. - // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. - sz_size_t const shorter_dim = shorter_length + 1; - sz_size_t const longer_dim = longer_length + 1; - - // The next few buffers will be swapped around. - sz_u512_vec_t previous_vec, current_vec, next_vec; - sz_u512_vec_t gaps_vec, substitutions_vec; - - // Load the strings into ZMM registers - just once. 
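-    // The longer string stays in natural order, while the shorter one is reversed below, so a single byte
-    // rotation per iteration keeps the two aligned along the current skewed diagonal.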
- sz_u512_vec_t longer_vec, shorter_vec, shorter_rotated_vec, rotate_left_vec, rotate_right_vec, ones_vec, bound_vec; - longer_vec.zmm = _mm512_maskz_loadu_epi8(_sz_u64_mask_until(longer_length), longer); - rotate_left_vec.zmm = _mm512_set_epi8( // - 0, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, // - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, // - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, // - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - rotate_right_vec.zmm = _mm512_set_epi8( // - 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, // - 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, // - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, // - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 63); - ones_vec.zmm = _mm512_set1_epi8(1); - bound_vec.zmm = _mm512_set1_epi8(bound <= 255 ? (sz_u8_t)bound : 255); - - // To simplify comparisons and traversals, we want to reverse the order of bytes in the shorter string. - for (sz_size_t i = 0; i != shorter_length; ++i) shorter_vec.u8s[63 - i] = shorter[i]; - shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_vec.zmm); - - // Let's say we are dealing with 3 and 5 letter words. - // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim). - // It will have: - // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3. - // - 2 diagonals of fixed length, at positions: 4, 5. - // - 3 diagonals of decreasing length, at positions: 6, 7, 8. - sz_size_t const diagonals_count = shorter_dim + longer_dim - 1; - - // Initialize the first two diagonals: - // - // previous_vec.u8s[0] = 0; - // current_vec.u8s[0] = current_vec.u8s[1] = 1; - // - // We can do a similar thing with vector ops: - previous_vec.zmm = _mm512_setzero_si512(); - current_vec.zmm = _mm512_set1_epi8(1); - - // We skip diagonals 0 and 1, as they are trivial. - // We will start with diagonal 2, which has length 3, with the first and last elements being preset, - // so we are effectively computing just one value, as will be marked by a single set bit in - // the `next_diagonal_mask` on the very first iteration. - sz_size_t next_diagonal_index = 2; - __mmask64 next_diagonal_mask = 0; - - // Progress through the upper triangle of the Levenshtein matrix. - for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) { - // After this iteration, the values at offset `0` and `next_diagonal_index` in the `next_vec` - // should be set to `next_diagonal_index`, but it's easier to broadcast the value to the whole vector, - // and later merge with a mask with new values. - next_vec.zmm = _mm512_set1_epi8((sz_u8_t)next_diagonal_index); - - // The mask also adds one set bit. - next_diagonal_mask = _kor_mask64(next_diagonal_mask, 1); - next_diagonal_mask = _kshiftli_mask64(next_diagonal_mask, 1); - - // Check for equality between string slices. 
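-        // Wherever the bytes differ, the substitution branch costs one more than the value on the diagonal two steps back.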
- __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - substitutions_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, substitutions_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(_mm512_permutexvar_epi8(rotate_right_vec.zmm, current_vec.zmm), current_vec.zmm), - ones_vec.zmm); - next_vec.zmm = _mm512_mask_min_epu8(next_vec.zmm, next_diagonal_mask, gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = current_vec.zmm; - current_vec.zmm = next_vec.zmm; - - // Shift the shorter string - shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_rotated_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. - __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - } - - // Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles. - for (; next_diagonal_index != longer_dim; ++next_diagonal_index) { - // After this iteration, the value `shorted_dim - 1` in the `next_vec` - // should be set to `next_diagonal_index`, but it's easier to broadcast the value to the whole vector, - // and later merge with a mask with new values. - next_vec.zmm = _mm512_set1_epi8((sz_u8_t)next_diagonal_index); - - // Make sure we update the first entry. - next_diagonal_mask = _kor_mask64(next_diagonal_mask, 1); - - // Check for equality between string slices. - __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(current_vec.zmm, _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm)), - ones_vec.zmm); - next_vec.zmm = _mm512_mask_min_epu8(next_vec.zmm, next_diagonal_mask, gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm); - current_vec.zmm = next_vec.zmm; - - // Let's shift the longer string now. - longer_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, longer_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. - __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - } - - // Now let's handle the bottom right triangle. - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - - // Check for equality between string slices. 
- __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(current_vec.zmm, _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm)), - ones_vec.zmm); - next_vec.zmm = _mm512_min_epu8(gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm); - current_vec.zmm = next_vec.zmm; - - // Let's shift the longer string now. - longer_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, longer_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. - __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - // In every following iterations we take use a shorter prefix of each register, - // but we don't need to update the `next_diagonal_mask` anymore... except for the early exit. - next_diagonal_mask = _kshiftri_mask64(next_diagonal_mask, 1); - } - return current_vec.u8s[0]; -} - -/** - * @brief Computes the edit distance between two somewhat short bytes-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 127, and evaluates at most (127 * 2 + 1 = 255) diagonals. - * Supports an early exit, if the distance is bounded. - * Uses a lot more CPU registers space, than the `upto63` variant. - * Benefits from the @b `vpermi2b` instructions, that can rotate the bytes in 2 registers at once. - * - * This may be one of the most freuqently called kernels for: - * - source code analysis, assuming most lines are either under 80 or under 120 characters long. - * - DNA sequence alignment, as most short reads are 50-300 characters long. - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto127_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -/** - * @brief Computes the edit distance between two longer bytes-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 255, and evaluates at most (255 * 2 + 1 = 511) diagonals. - * Supports an early exit, if the distance is bounded. - * Uses a lot more CPU registers space, than the `upto63` variant. - * - * Each of 2x string ends up occupying 4 ZMM registers, and each of 3x diagonals uses 4 ZMM registers. - * So 20x of the 32x are persistently occupied, and the rest are used for math temporarily. - * This is the largest space-efficient variant, as strings beyond 255 characters may require - * 16-bit accumulators, which would be a significant bottleneck. - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -/** - * @brief Computes the edit distance between two longer bytes-strings using the AVX-512VBMI extensions, - * assuming the upper distance bound can not exceed 255, but the string length can be arbitrary. 
- * - * Applies to string lengths up to 255, and evaluates at most (255 * 2 + 1 = 511) diagonals. - * Supports an early exit, if the distance is bounded. - * Uses a lot more CPU registers space, than the `upto63` variant. - * - * Each of 2x string ends up occupying 4 ZMM registers, and each of 3x diagonals uses 4 ZMM registers. - * So 20x of the 32x are persistently occupied, and the rest are used for math temporarily. - * This is the largest space-efficient variant, as strings beyond 255 characters may require - * 16-bit accumulators, which would be a significant bottleneck. - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto255bound_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -/** - * @brief Computes the edit distance between two mid-length UTF-8-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 127, and evaluates at most (127 * 2 + 1 = 511) diagonals. - * Supports an early exit, if the distance is bounded. - * Benefits from the @b `valignd` instructions used to rotate UTF-32 unpacked unicode codepoints. - * - * Each string is unpacked into 128 characters * 4 bytes per character / 64 bytes per register = 8 registers. - * - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_utf8_skewed_diagonals_upto127_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - sz_unused(shorter && longer && bound && alloc); - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // TODO: Generalize! - sz_size_t const max_length = 256u * 256u; - sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); - sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); - sz_unused(longer_length && bound && max_length); - -#if 0 - // We are going to store 3 diagonals of the matrix. - // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. - sz_size_t const shorter_dim = shorter_length + 1; - sz_size_t const longer_dim = longer_length + 1; - // Unlike the serial version, we also want to avoid reverse-order iteration over teh shorter string. - // So let's allocate a bit more memory and reverse-export our shorter string into that buffer. - sz_size_t const buffer_length = sizeof(sz_u16_t) * longer_dim * 3 + shorter_length; - sz_u16_t *const distances = (sz_u16_t *)alloc->allocate(buffer_length, alloc->handle); - if (!distances) return SZ_SIZE_MAX; - - // The next few pointers will be swapped around. - sz_u16_t *previous_distances = distances; - sz_u16_t *current_distances = previous_distances + longer_dim; - sz_u16_t *next_distances = current_distances + longer_dim; - sz_ptr_t const shorter_reversed = (sz_ptr_t)(next_distances + longer_dim); - - // Export the reversed string into the buffer. 
- for (sz_size_t i = 0; i != shorter_length; ++i) shorter_reversed[i] = shorter[shorter_length - 1 - i]; - - // Initialize the first two diagonals: - previous_distances[0] = 0; - current_distances[0] = current_distances[1] = 1; - - // Using ZMM registers, we can process 32x 16-bit values at once, - // storing 16 bytes of each string in YMM registers. - sz_u512_vec_t insertions_vec, deletions_vec, substitutions_vec, next_vec; - sz_u512_vec_t ones_u16_vec; - ones_u16_vec.zmm = _mm512_set1_epi16(1); - - // This is a mixed-precision implementation, using 8-bit representations for part of the operations. - // Even there, in case `SZ_USE_X86_AVX2=0`, let's use the `sz_u512_vec_t` type, addressing the first YMM halfs. - sz_u512_vec_t shorter_vec, longer_vec; - sz_u512_vec_t ones_u8_vec; - ones_u8_vec.ymms[0] = _mm256_set1_epi8(1); - - // Let's say we are dealing with 3 and 5 letter words. - // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim). - // It will have: - // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3. - // - 2 diagonals of fixed length, at positions: 4, 5. - // - 3 diagonals of decreasing length, at positions: 6, 7, 8. - sz_size_t const diagonals_count = shorter_dim + longer_dim - 1; - - // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_diagonal_index = 2; - for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = next_diagonal_index + 1; - for (sz_size_t offset_within_diagonal = 0; offset_within_diagonal + 2 < next_diagonal_length;) { - sz_u32_t remaining_length = (sz_u32_t)(next_diagonal_length - offset_within_diagonal - 2); - sz_u32_t register_length = remaining_length < 32 ? remaining_length : 32; - sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length); - longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + offset_within_diagonal); - // Our original code addressed the shorter string `[next_diagonal_index - offset_within_diagonal - 2]` - // for growing `offset_within_diagonal`. If the `shorter` string was reversed, the - // `[next_diagonal_index - offset_within_diagonal - 2]` would be equal to `[shorter_length - 1 - - // next_diagonal_index + offset_within_diagonal + 2]`. Which simplified would be equal to - // `[shorter_length - next_diagonal_index + offset_within_diagonal + 1]`. - shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8( // - remaining_length_mask, - shorter_reversed + shorter_length - next_diagonal_index + offset_within_diagonal + 1); - // For substitutions, perform the equality comparison using AVX2 instead of AVX-512 - // to get the result as a vector, instead of a bitmask. Adding 1 to every scalar we can overflow - // transforming from {0xFF, 0} values to {0, 1} values - exactly what we need. Then - upcast to 16-bit. - substitutions_vec.zmm = _mm512_cvtepi8_epi16( // - _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0])); - substitutions_vec.zmm = _mm512_add_epi16( // - substitutions_vec.zmm, - _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + offset_within_diagonal)); - // For insertions and deletions, on modern hardware, it's faster to issue two separate loads, - // than rotate the bytes in the ZMM register. 
- insertions_vec.zmm = - _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + offset_within_diagonal); - deletions_vec.zmm = - _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + offset_within_diagonal + 1); - // First get the minimum of insertions and deletions. - next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm); - next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm); - _mm512_mask_storeu_epi16(next_distances + offset_within_diagonal + 1, remaining_length_mask, next_vec.zmm); - offset_within_diagonal += register_length; - } - // Don't forget to populate the first row and the first column of the Levenshtein matrix. - next_distances[0] = next_distances[next_diagonal_length - 1] = (sz_u16_t)next_diagonal_index; - // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory. - sz_u16_t *temporary = previous_distances; - previous_distances = current_distances; - current_distances = next_distances; - next_distances = temporary; - } - - // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a - // larger diagonal. From now onwards, we will be shrinking. Instead of adding value equal to the skewed diagonal - // index on either side, we will be cropping those values out. - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; - for (sz_size_t i = 0; i != next_diagonal_length;) { - sz_u32_t remaining_length = (sz_u32_t)(next_diagonal_length - i); - sz_u32_t register_length = remaining_length < 32 ? remaining_length : 32; - sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length); - longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + next_diagonal_index - n + i); - // Our original code addressed the shorter string `[shorter_length - 1 - i]` for growing `i`. - // If the `shorter` string was reversed, the `[shorter_length - 1 - i]` would - // be equal to `[shorter_length - 1 - shorter_length + 1 + i]`. - // Which simplified would be equal to just `[i]`. Beautiful! - shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, shorter_reversed + i); - // For substitutions, perform the equality comparison using AVX2 instead of AVX-512 - // to get the result as a vector, instead of a bitmask. The compare it against the accumulated - // substitution costs. - substitutions_vec.zmm = _mm512_cvtepi8_epi16( // - _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0])); - substitutions_vec.zmm = _mm512_add_epi16( // - substitutions_vec.zmm, _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + i)); - // For insertions and deletions, on modern hardware, it's faster to issue two separate loads, - // than rotate the bytes in the ZMM register. - insertions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i); - deletions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i + 1); - // First get the minimum of insertions and deletions. 
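// Scalar view of the reduction below, with all costs already folded in:
//
//     next = sz_min_of_two(sz_min_of_two(insertion, deletion) + 1, substitution);
//
// insertions and deletions pay one extra edit, while `substitution` already carries its 0/1
// mismatch penalty from the comparison trick above.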
- next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm); - next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm); - _mm512_mask_storeu_epi16(next_distances + i, remaining_length_mask, next_vec.zmm); - i += register_length; - } - - // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory, this time, with a shift, - // dropping the first element in the current array. - sz_u16_t *temporary = previous_distances; - previous_distances = current_distances + 1; - current_distances = next_distances; - next_distances = temporary; - } - - // Cache scalar before `free` call. - sz_size_t result = current_distances[0]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -#endif - return 0; -} - -SZ_INTERNAL sz_size_t sz_edit_distance_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Bounded computations may exit early. - int const is_bounded = bound < longer_length; - if (is_bounded) { - // If one of the strings is empty - the edit distance is equal to the length of the other one. - if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); - // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; - } - - // Make sure the shorter string is actually shorter. - if (shorter_length > longer_length) { - sz_cptr_t temporary = shorter; - shorter = longer; - longer = temporary; - sz_size_t temporary_length = shorter_length; - shorter_length = longer_length; - longer_length = temporary_length; - } - - // Dispatch the right implementation based on the length of the strings. - if (longer_length < 64u) - return _sz_edit_distance_skewed_diagonals_upto63_avx512( // - shorter, shorter_length, longer, longer_length, bound); - // else if (longer_length < 256u * 256u) - // return _sz_edit_distance_skewed_diagonals_upto65k_avx512( // - // shorter, shorter_length, longer, longer_length, bound, alloc); - else - return sz_edit_distance_serial(shorter, shorter_length, longer, longer_length, bound, alloc); -} - -SZ_PUBLIC sz_u64_t sz_checksum_avx512(sz_cptr_t text, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "loads". - // - // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, - // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. - // With two strings, we may consider the overal workload huge, if each exceeds 1 MB in length. - int const is_huge = length >= 1ull * 1024ull * 1024ull; - sz_u512_vec_t text_vec, sums_vec; - - // When the buffer is small, there isn't much to innovate. 
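// Functionally this routine is just a byte sum, i.e. the scalar reference is:
//
//     sz_u64_t sum = 0;
//     for (sz_size_t i = 0; i != length; ++i) sum += (sz_u8_t)text[i];
//     return sum;
//
// Every SIMD path below leans on `SAD` (sum of absolute differences) against a zero vector,
// which horizontally adds each group of 8 bytes into a 64-bit lane - one of the cheapest
// horizontal byte reductions available on x86.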
- if (length <= 16) { - __mmask16 mask = _sz_u16_mask_until(length); - text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text); - sums_vec.xmms[0] = _mm_sad_epu8(text_vec.xmms[0], _mm_setzero_si128()); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_vec.xmms[0]); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_vec.xmms[0], 1); - return low + high; - } - else if (length <= 32) { - __mmask32 mask = _sz_u32_mask_until(length); - text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text); - sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256()); - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - return low + high; - } - else if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - text_vec.zmm = _mm512_maskz_loadu_epi8(mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - return _mm512_reduce_add_epi64(sums_vec.zmm); - } - else if (!is_huge) { - sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(text + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - for (text += head_length; body_length >= 64; text += 64, body_length -= 64) { - text_vec.zmm = _mm512_load_si512((__m512i const *)text); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - } - text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - return _mm512_reduce_add_epi64(sums_vec.zmm); - } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // - // 1. Moving in both directions to maximize the throughput, when fetching from multiple - // memory pages. Also helps with cache set-associativity issues, as we won't always - // be fetching the same entries in the lookup table. - // 2. Using non-temporal stores to avoid polluting the cache. - // 3. Prefetching the next cache line, to avoid stalling the CPU. This generally useless - // for predictable patterns, so disregard this advice. - // - // Bidirectional traversal generally adds about 10% to such algorithms. 
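// Worked example of the head/body/tail split used in the branch above and recomputed below:
// if `text` starts 7 bytes past a 64-byte boundary and `length == 200`, then `head_length == 57`,
// `tail_length == (7 + 200) % 64 == 15`, and `body_length == 128` - exactly two aligned ZMM words.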
- else { - sz_u512_vec_t text_reversed_vec, sums_reversed_vec; - sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; - sz_size_t tail_length = (sz_size_t)(text + length) % 64; - sz_size_t body_length = length - head_length - tail_length; - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length); - sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()); - - // Now in the main loop, we can use non-temporal loads and stores, - // performing the operation in both directions. - for (text += head_length; body_length >= 128; text += 64, text += 64, body_length -= 128) { - text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text)); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - text_reversed_vec.zmm = _mm512_stream_load_si512((__m512i *)(text + body_length - 64)); - sums_reversed_vec.zmm = - _mm512_add_epi64(sums_reversed_vec.zmm, _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512())); - } - if (body_length >= 64) { - text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text)); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - } - - return _mm512_reduce_add_epi64(_mm512_add_epi64(sums_vec.zmm, sums_reversed_vec.zmm)); - } -} - -SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - if (length < 4 * window_length) { - sz_hashes_serial(start, length, window_length, step, callback, callback_handle); - return; - } - - // Using AVX2, we can perform 4 long integer multiplications and additions within one register. - // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel. - sz_size_t const max_hashes = length - window_length + 1; - sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads. - sz_u8_t const *text_first = (sz_u8_t const *)start; - sz_u8_t const *text_second = text_first + min_hashes_per_thread; - sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2; - sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3; - sz_u8_t const *text_end = text_first + length; - - // Broadcast the global constants into the registers. - // Both high and low hashes will work with the same prime and golden ratio. - sz_u512_vec_t prime_vec, golden_ratio_vec; - prime_vec.zmm = _mm512_set1_epi64(SZ_U64_MAX_PRIME); - golden_ratio_vec.zmm = _mm512_set1_epi64(11400714819323198485ull); - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // We will be evaluating 4 offsets at a time with 2 different hash functions. - // We can fit all those 8 state variables in each of the following ZMM registers. 
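// The eight 64-bit lanes maintained below implement a textbook rolling (Rabin-Karp style) hash,
// just for four window offsets x two hash functions at a time. Ignoring the byte offset applied
// to one of the hash families and the cheaper conditional-subtraction modulo used later, each
// lane performs the classic per-character update:
//
//     hash = ((hash - text[oldest] * prime_power) * base + text[newest]) % prime;
//
// where `prime_power == base ^ (window_length - 1) % prime`, matching the `prime_power_low` and
// `prime_power_high` values computed above for the bases 31 and 257.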
- sz_u512_vec_t base_vec, prime_power_vec, shift_vec; - base_vec.zmm = _mm512_set_epi64(31ull, 31ull, 31ull, 31ull, 257ull, 257ull, 257ull, 257ull); - shift_vec.zmm = _mm512_set_epi64(0ull, 0ull, 0ull, 0ull, 77ull, 77ull, 77ull, 77ull); - prime_power_vec.zmm = _mm512_set_epi64(prime_power_low, prime_power_low, prime_power_low, prime_power_low, - prime_power_high, prime_power_high, prime_power_high, prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u512_vec_t hash_vec, chars_vec; - hash_vec.zmm = _mm512_setzero_si512(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`... - chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - - // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); - } - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - sz_u512_vec_t hash_mix_vec; - hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_extracti64x4_epi64(hash_mix_vec.zmm, 0)); - - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. - chars_vec.zmm = _mm512_set_epi64(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length], // - text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - hash_vec.zmm = _mm512_sub_epi64(hash_vec.zmm, _mm512_mullo_epi64(chars_vec.zmm, prime_power_vec.zmm)); - - // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. 
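// The same four input bytes are broadcast into both 256-bit halves of `chars_vec` below: one half
// advances the base-31 states, the other the base-257 states, whose characters are additionally
// offset via `shift_vec`, so a single lane-wise 64-bit update step drives all eight rolling
// hashes at once.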
- chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], //
- text_fourth[0], text_third[0], text_second[0], text_first[0]);
- chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm);
-
- // ... and prefetch the next four characters into Level 2 or higher.
- _mm_prefetch((sz_cptr_t)text_fourth + 1, _MM_HINT_T1);
- _mm_prefetch((sz_cptr_t)text_third + 1, _MM_HINT_T1);
- _mm_prefetch((sz_cptr_t)text_second + 1, _MM_HINT_T1);
- _mm_prefetch((sz_cptr_t)text_first + 1, _MM_HINT_T1);
-
- // 3. Add the incoming characters.
- hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm);
-
- // 4. Compute the modulo. Assuming there are only 59 values between our prime
- // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime.
- hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm,
- _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm));
-
- // 5. Compute the hash mix, that will be used to index into the fingerprint.
- // This includes a serial step at the end.
- hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm);
- hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), //
- _mm512_castsi512_si256(hash_mix_vec.zmm));
-
- if ((cycle & step_mask) == 0) {
- callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle);
- callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle);
- callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle);
- callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle);
+ // Now we can guarantee that the relative shift within registers is from 1 to 63 bytes and the output is aligned.
+ // We may need to shift across more than two ZMM registers, so the `valignr` family of instructions is worth considering.
+ // Sadly, using `_mm512_alignr_epi8` doesn't make sense, as it operates at a 128-bit granularity.
+ //
+ // - `_mm256_alignr_epi8` shifts an entire 256-bit register, but we would need many of them.
+ // - `_mm512_alignr_epi32` shifts 512-bit chunks, but only if the `shift` is a multiple of 4 bytes.
+ // - `_mm512_alignr_epi64` shifts 512-bit chunks by 8 bytes.
+ //
+ // All of those have a latency of 1 cycle, and the shift amount must be an immediate value!
+ // For 1-byte-shift granularity, `_mm512_permutex2var_epi8` has a latency of 6 cycles and needs VBMI!
+ // The most efficient and broadly compatible alternative could be to use a combination of align and shuffle.
+ // A similar approach was outlined in "Byte-wise alignr in AVX512F" by Wojciech Muła.
+ // http://0x80.pl/notesen/2016-10-16-avx512-byte-alignr.html
+ //
+ // That solution is quite verbose, as it assumes compile-time constants for the shift amount.
+ // A cleaner one, with a latency of 3 cycles, is to use `_mm512_permutexvar_epi8` or
+ // `_mm512_mask_permutexvar_epi8`, which can be seen as a combination of a cross-register shuffle and a blend,
+ // and is available with VBMI. That solution is still noticeably slower than AVX2.
+ //
+ // The GLibC implementation also uses non-temporal stores for larger buffers; we don't.
+ // https://codebrowser.dev/glibc/glibc/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S.html
+ if (left_to_right_traversal) {
+ // Head, body, and tail.
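// As a rough illustration of the `_mm512_permutex2var_epi8` option mentioned above - a sketch of
// the alternative, not what this branch actually does - a byte-granular "alignr" over two ZMM
// registers could be assembled like this:
//
//     sz_u8_t indices[64];
//     for (int i = 0; i != 64; ++i) indices[i] = (sz_u8_t)(i + shift); // `shift` in [1; 63]
//     __m512i selector = _mm512_loadu_si512(indices);
//     // Index values below 64 pick bytes from `low`, values 64 and above pick bytes from `high`.
//     __m512i aligned = _mm512_permutex2var_epi8(low, selector, high);
//
// In practice the selector would be built once per call, outside of the copy loop.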
+ _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); + for (target += head_length, source += head_length; body_length >= 64; + target += 64, source += 64, body_length -= 64) + _mm512_store_si512(target, _mm512_loadu_si512(source)); + _mm512_mask_storeu_epi8(target, tail_mask, _mm512_maskz_loadu_epi8(tail_mask, source)); + } + else { + // Tail, body, and head. + _mm512_mask_storeu_epi8(target + head_length + body_length, tail_mask, + _mm512_maskz_loadu_epi8(tail_mask, source + head_length + body_length)); + for (; body_length >= 64; body_length -= 64) + _mm512_store_si512(target + head_length + body_length - 64, + _mm512_loadu_si512(source + head_length + body_length - 64)); + _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); } } } #pragma clang attribute pop #pragma GCC pop_options +#endif // SZ_USE_SKYLAKE +#pragma endregion // Skylake Implementation +/* AVX512 implementation of the string search algorithms for Ice Lake and newer CPUs. + * Includes extensions: + * - 2017 Skylake: F, CD, ER, PF, VL, DQ, BW, + * - 2018 CannonLake: IFMA, VBMI, + * - 2019 Ice Lake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES. + */ +#pragma region Ice Lake Implementation +#if SZ_USE_ICE #pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512vbmi", "avx512vbmi2", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512vbmi,avx512vbmi2,bmi,bmi2"))), \ +#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2") +#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ apply_to = function) -SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { +SZ_PUBLIC void sz_look_up_transform_ice(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { // If the input is tiny (especially smaller than the look-up table itself), we may end up paying // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. @@ -5920,396 +957,20 @@ SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, s } } -SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - - // Before initializing the AVX-512 vectors, we may want to run the sequential code for the first few bytes. - // In practice, that only hurts, even when we have matches every 5-ish bytes. - // - // if (length < SZ_SWAR_THRESHOLD) return sz_find_charset_serial(text, length, filter); - // sz_cptr_t early_result = sz_find_charset_serial(text, SZ_SWAR_THRESHOLD, filter); - // if (early_result) return early_result; - // text += SZ_SWAR_THRESHOLD; - // length -= SZ_SWAR_THRESHOLD; - // - // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. - // That way when we invoke `_mm512_shuffle_epi8` we can use the same mask for both lanes. - sz_u512_vec_t filter_even_vec, filter_odd_vec; - __m256i filter_ymm = _mm256_lddqu_si256((__m256i const *)filter); - // There are a few way to initialize filters without having native strided loads. 
- // In the cronological order of experiments: - // - serial code initializing 128 bytes of odd and even mask - // - using several shuffles - // - using `_mm512_permutexvar_epi8` - // - using `_mm512_broadcast_i32x4(_mm256_castsi256_si128(_mm256_maskz_compress_epi8(0x55555555, filter_ymm)))` - // and `_mm512_broadcast_i32x4(_mm256_castsi256_si128(_mm256_maskz_compress_epi8(0xaaaaaaaa, filter_ymm)))` - filter_even_vec.zmm = _mm512_broadcast_i32x4(_mm256_castsi256_si128( // broadcast __m128i to __m512i - _mm256_maskz_compress_epi8(0x55555555, filter_ymm))); - filter_odd_vec.zmm = _mm512_broadcast_i32x4(_mm256_castsi256_si128( // broadcast __m128i to __m512i - _mm256_maskz_compress_epi8(0xaaaaaaaa, filter_ymm))); - // After the unzipping operation, we can validate the contents of the vectors like this: - // - // for (sz_size_t i = 0; i != 16; ++i) { - // sz_assert(filter_even_vec.u8s[i] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 16] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 16] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 32] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 32] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 48] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 48] == filter->_u8s[i * 2 + 1]); - // } - // - sz_u512_vec_t text_vec; - sz_u512_vec_t lower_nibbles_vec, higher_nibbles_vec; - sz_u512_vec_t bitset_even_vec, bitset_odd_vec; - sz_u512_vec_t bitmask_vec, bitmask_lookup_vec; - bitmask_lookup_vec.zmm = _mm512_set_epi8( // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); - - while (length) { - // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set" - // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so - // StrinZilla uses a somewhat different approach. - // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new - // - // sz_u8_t input = *(sz_u8_t const *)text; - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = filter_even_vec.u8s[hi_nibble]; - // sz_u8_t bitset_odd = filter_odd_vec.u8s[hi_nibble]; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_u8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd; - // if ((bitset & bitmask) != 0) return text; - // else { length--, text++; } - // - // The nice part about this, loading the strided data is vey easy with Arm NEON, - // while with x86 CPUs after AVX, shuffles within 256 bits shouldn't be an issue either. 
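// Worked example for the byte 'a' (0x61): `lo_nibble == 0x1`, `hi_nibble == 0x6`, so the candidate
// bitset byte is `filter->_u8s[0x6 * 2]` (an "even" byte, since `lo_nibble < 8`) and the probe mask
// is `1 << 1` - the same bit the serial `(set->_u8s[c >> 3] & (1u << (c & 7u)))` check would test.
// The code below performs that exact lookup for up to 64 input bytes per iteration.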
- sz_size_t load_length = sz_min_of_two(length, 64); - __mmask64 load_mask = _sz_u64_mask_until(load_length); - text_vec.zmm = _mm512_maskz_loadu_epi8(load_mask, text); - lower_nibbles_vec.zmm = _mm512_and_si512(text_vec.zmm, _mm512_set1_epi8(0x0f)); - bitmask_vec.zmm = _mm512_shuffle_epi8(bitmask_lookup_vec.zmm, lower_nibbles_vec.zmm); - // - // At this point we can validate the `bitmask_vec` contents like this: - // - // for (sz_size_t i = 0; i != load_length; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); - // } - // - // Shift right every byte by 4 bits. - // There is no `_mm512_srli_epi8` intrinsic, so we have to use `_mm512_srli_epi16` - // and combine it with a mask to clear the higher bits. - higher_nibbles_vec.zmm = _mm512_and_si512(_mm512_srli_epi16(text_vec.zmm, 4), _mm512_set1_epi8(0x0f)); - bitset_even_vec.zmm = _mm512_shuffle_epi8(filter_even_vec.zmm, higher_nibbles_vec.zmm); - bitset_odd_vec.zmm = _mm512_shuffle_epi8(filter_odd_vec.zmm, higher_nibbles_vec.zmm); - // - // At this point we can validate the `bitset_even_vec` and `bitset_odd_vec` contents like this: - // - // for (sz_size_t i = 0; i != load_length; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t const *bitset_ptr = &filter->_u8s[0]; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; - // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); - // } - // - // TODO: Is this a good place for ternary logic? - __mmask64 take_first = _mm512_cmplt_epi8_mask(lower_nibbles_vec.zmm, _mm512_set1_epi8(8)); - bitset_even_vec.zmm = _mm512_mask_blend_epi8(take_first, bitset_odd_vec.zmm, bitset_even_vec.zmm); - __mmask64 matches_mask = _mm512_mask_test_epi8_mask(load_mask, bitset_even_vec.zmm, bitmask_vec.zmm); - if (matches_mask) { - int offset = sz_u64_ctz(matches_mask); - return text + offset; - } - else { text += load_length, length -= load_length; } - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); -} - -SZ_PUBLIC sz_cptr_t sz_find_many_avx512( // - sz_cptr_t haystack, sz_size_t haystack_length, // - sz_cptr_t const *needles, sz_size_t const *needles_lengths, // - sz_size_t *needle_offset) { - - // When dealing with huge needles vocabularies, like in tokenization workloads, we need to construct an automaton. - // But in many cases, the vocabulary is small enough to use a simpler DFA-less approach, combining the ideas from - // the `sz_find_avx512` and `sz_find_charset_avx512` functions. - // - // Pick the offsets within needles where there is the least variance in the characters. - // Like for "the", "then", "there", "these", "those", "their", "they", "them", "that", "this", "thus", "than": - // - // 0: 't' - // 1: 'h' - // 2: 'e', 'a', 'i', 'o', 'u' - // 3: 'n', 'r', 's', 'i', 'y', 'm', 't' - // - // So depending on our "register budget", we can use a different number of pivot points: offset 0, 1, 2 make - // the most sense if we can only use 3 ZMM registers. - sz_unused(haystack && haystack_length && needles && needles_lengths && needle_offset); - return 0; -} - -/** - * Computes the Needleman Wunsch alignment score between two strings. 
- * The method uses 32-bit integers to accumulate the running score for every cell in the matrix. - * Assuming the costs of substitutions can be arbitrary signed 8-bit integers, the method is expected to be used - * on strings not exceeding 2^24 length or 16.7 million characters. - * - * Unlike the `_sz_edit_distance_skewed_diagonals_upto65k_avx512` method, this one uses signed integers to store - * the accumulated score. Moreover, it's primary bottleneck is the latency of gathering the substitution costs - * from the substitution matrix. If we use the diagonal order, we will be comparing a slice of the first string with - * a slice of the second. If we stick to the conventional horizontal order, we will be comparing one character against - * a slice, which is much easier to optimize. In that case we are sampling costs not from arbitrary parts of - * a 256 x 256 matrix, but from a single row! - */ -SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { - - // If one of the strings is empty - the edit distance is equal to the length of the other one - if (longer_length == 0) return (sz_ssize_t)shorter_length * gap; - if (shorter_length == 0) return (sz_ssize_t)longer_length * gap; - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - sz_size_t const max_length = 256ull * 256ull * 256ull; - sz_size_t const n = longer_length + 1; - sz_assert(n < max_length && "The length must fit into 24-bit integer. Otherwise use serial variant."); - sz_unused(longer_length && max_length); - - sz_size_t buffer_length = sizeof(sz_i32_t) * n * 2; - sz_i32_t *distances = (sz_i32_t *)alloc->allocate(buffer_length, alloc->handle); - sz_i32_t *previous_distances = distances; - sz_i32_t *current_distances = previous_distances + n; - - // Intialize the first row of the Levenshtein matrix with `iota`. - for (sz_size_t idx_longer = 0; idx_longer != n; ++idx_longer) - previous_distances[idx_longer] = (sz_i32_t)idx_longer * gap; - - /// Contains up to 16 consecutive characters from the longer string. - sz_u512_vec_t longer_vec; - sz_u512_vec_t cost_deletion_vec, cost_substitution_vec, lookup_substitution_vec, current_vec; - sz_u512_vec_t row_first_subs_vec, row_second_subs_vec, row_third_subs_vec, row_fourth_subs_vec; - sz_u512_vec_t shuffled_first_subs_vec, shuffled_second_subs_vec, shuffled_third_subs_vec, shuffled_fourth_subs_vec; - - // Prepare constants and masks. 
- sz_u512_vec_t is_third_or_fourth_vec, is_second_or_fourth_vec, gap_vec; - { - char is_third_or_fourth_check, is_second_or_fourth_check; - *(sz_u8_t *)&is_third_or_fourth_check = 0x80, *(sz_u8_t *)&is_second_or_fourth_check = 0x40; - is_third_or_fourth_vec.zmm = _mm512_set1_epi8(is_third_or_fourth_check); - is_second_or_fourth_vec.zmm = _mm512_set1_epi8(is_second_or_fourth_check); - gap_vec.zmm = _mm512_set1_epi32(gap); - } - - sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter; - for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) { - sz_i32_t last_in_row = current_distances[0] = (sz_i32_t)(idx_shorter + 1) * gap; - - // Load one row of the substitution matrix into four ZMM registers. - sz_error_cost_t const *row_subs = subs + shorter_unsigned[idx_shorter] * 256u; - row_first_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 0); - row_second_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 1); - row_third_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 2); - row_fourth_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 3); - - // In the serial version we have one forward pass, that computes the deletion, - // insertion, and substitution costs at once. - // for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) { - // sz_ssize_t cost_deletion = previous_distances[idx_longer + 1] + gap; - // sz_ssize_t cost_insertion = current_distances[idx_longer] + gap; - // sz_ssize_t cost_substitution = previous_distances[idx_longer] + row_subs[longer_unsigned[idx_longer]]; - // current_distances[idx_longer + 1] = sz_min_of_three(cost_deletion, cost_insertion, cost_substitution); - // } - // - // Given the complexity of handling the data-dependency between consecutive insertion cost computations - // within a Levenshtein matrix, the simplest design would be to vectorize every kind of cost computation - // separately. - // 1. Compute substitution costs for up to 64 characters at once, upcasting from 8-bit integers to 32. - // 2. Compute the pairwise minimum with deletion costs. - // 3. Inclusive prefix minimum computation to combine with addition costs. - // Proceeding with substitutions: - for (sz_size_t idx_longer = 0; idx_longer < longer_length; idx_longer += 64) { - sz_size_t register_length = sz_min_of_two(longer_length - idx_longer, 64); - __mmask64 mask = _sz_u64_mask_until(register_length); - longer_vec.zmm = _mm512_maskz_loadu_epi8(mask, longer + idx_longer); - - // Blend the `row_(first|second|third|fourth)_subs_vec` into `current_vec`, picking the right source - // for every character in `longer_vec`. Before that, we need to permute the subsititution vectors. - // Only the bottom 6 bits of a byte are used in VPERB, so we don't even need to mask. - shuffled_first_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_first_subs_vec.zmm); - shuffled_second_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_second_subs_vec.zmm); - shuffled_third_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_third_subs_vec.zmm); - shuffled_fourth_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_fourth_subs_vec.zmm); - - // To blend we can invoke three `_mm512_cmplt_epu8_mask`, but we can also achieve the same using - // the AND logical operation, checking the top two bits of every byte. - // Continuing this thought, we can use the VPTESTMB instruction to output the mask after the AND. 
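// In scalar terms, the blend below is a 2-bit multiplexer over the four 64-byte quarters of the
// substitution row: for a character `c`, the 0x40 bit picks the second register of a pair and the
// 0x80 bit picks the second pair, so `cost = row_subs[c]` decomposes into
//
//     quarter = c >> 6;                  // 0..3, encoded by the two bits tested below
//     cost = quarters[quarter][c & 63];  // `c & 63` is exactly what VPERMB already used as index
//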
- __mmask64 is_third_or_fourth = _mm512_mask_test_epi8_mask(mask, longer_vec.zmm, is_third_or_fourth_vec.zmm); - __mmask64 is_second_or_fourth = - _mm512_mask_test_epi8_mask(mask, longer_vec.zmm, is_second_or_fourth_vec.zmm); - lookup_substitution_vec.zmm = _mm512_mask_blend_epi8( - is_third_or_fourth, - // Choose between the first and the second. - _mm512_mask_blend_epi8(is_second_or_fourth, shuffled_first_subs_vec.zmm, shuffled_second_subs_vec.zmm), - // Choose between the third and the fourth. - _mm512_mask_blend_epi8(is_second_or_fourth, shuffled_third_subs_vec.zmm, shuffled_fourth_subs_vec.zmm)); - - // First, sign-extend lower and upper 16 bytes to 16-bit integers. - __m512i current_0_31_vec = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(lookup_substitution_vec.zmm, 0)); - __m512i current_32_63_vec = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(lookup_substitution_vec.zmm, 1)); - - // Now extend those 16-bit integers to 32-bit. - // This isn't free, same as the subsequent store, so we only want to do that for the populated lanes. - // To minimize the number of loads and stores, we can combine our substitution costs with the previous - // distances, containing the deletion costs. - { - cost_substitution_vec.zmm = _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer); - cost_substitution_vec.zmm = _mm512_add_epi32( - cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_0_31_vec, 0))); - cost_deletion_vec.zmm = _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer); - cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm); - current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm); - - // Inclusive prefix minimum computation to combine with insertion costs. - // Simply disabling this operation results in 5x performance improvement, meaning - // that this operation is responsible for 80% of the total runtime. - // for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) { - // current_distances[idx_longer + 1] = - // sz_max_of_two(current_distances[idx_longer] + gap, current_distances[idx_longer + 1]); - // } - // - // To perform the same operation in vectorized form, we need to perform a tree-like reduction, - // that will involve multiple steps. It's quite expensive and should be first tested in the - // "experimental" section. - // - // Another approach might be loop unrolling: - // current_vec.i32s[0] = last_in_row = sz_i32_max_of_two(current_vec.i32s[0], last_in_row + gap); - // current_vec.i32s[1] = last_in_row = sz_i32_max_of_two(current_vec.i32s[1], last_in_row + gap); - // current_vec.i32s[2] = last_in_row = sz_i32_max_of_two(current_vec.i32s[2], last_in_row + gap); - // ... yet this approach is also quite expensive. - for (int i = 0; i != 16; ++i) - current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap); - _mm512_mask_storeu_epi32(current_distances + idx_longer + 1, (__mmask16)mask, current_vec.zmm); - } - - // Export the values from 16 to 31. 
- if (register_length > 16) { - mask = _kshiftri_mask64(mask, 16); - cost_substitution_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 16); - cost_substitution_vec.zmm = _mm512_add_epi32( - cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_0_31_vec, 1))); - cost_deletion_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 16); - cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm); - current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm); - - // Aggregate running insertion costs within the register. - for (int i = 0; i != 16; ++i) - current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap); - _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 16, (__mmask16)mask, current_vec.zmm); - } - - // Export the values from 32 to 47. - if (register_length > 32) { - mask = _kshiftri_mask64(mask, 16); - cost_substitution_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 32); - cost_substitution_vec.zmm = _mm512_add_epi32( - cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_32_63_vec, 0))); - cost_deletion_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 32); - cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm); - current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm); - - // Aggregate running insertion costs within the register. - for (int i = 0; i != 16; ++i) - current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap); - _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 32, (__mmask16)mask, current_vec.zmm); - } - - // Export the values from 32 to 47. - if (register_length > 48) { - mask = _kshiftri_mask64(mask, 16); - cost_substitution_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 48); - cost_substitution_vec.zmm = _mm512_add_epi32( - cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_32_63_vec, 1))); - cost_deletion_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 48); - cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm); - current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm); - - // Aggregate running insertion costs within the register. - for (int i = 0; i != 16; ++i) - current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap); - _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 48, (__mmask16)mask, current_vec.zmm); - } - } - - // Swap previous_distances and current_distances pointers - sz_pointer_swap((void **)&previous_distances, (void **)¤t_distances); - } - - // Cache scalar before `free` call. 
- sz_ssize_t result = previous_distances[longer_length]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -SZ_INTERNAL sz_ssize_t sz_alignment_score_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { - - if (sz_max_of_two(shorter_length, longer_length) < (256ull * 256ull * 256ull)) - return _sz_alignment_score_wagner_fisher_upto17m_avx512(shorter, shorter_length, longer, longer_length, subs, - gap, alloc); - else - return sz_alignment_score_serial(shorter, shorter_length, longer, longer_length, subs, gap, alloc); -} - enum sz_encoding_t { sz_encoding_unknown_k = 0, sz_encoding_ascii_k = 1, sz_encoding_utf8_k = 2, sz_encoding_utf16_k = 3, sz_encoding_utf32_k = 4, - sz_jwt_k, - sz_base64_k, + sz_encoding_jwt_k = 5, + sz_encoding_base64_k = 6, // Low priority encodings: - sz_encoding_utf8bom_k = 5, - sz_encoding_utf16le_k = 6, - sz_encoding_utf16be_k = 7, - sz_encoding_utf32le_k = 8, - sz_encoding_utf32be_k = 9, + sz_encoding_utf8bom_k = 7, + sz_encoding_utf16le_k = 8, + sz_encoding_utf16be_k = 9, + sz_encoding_utf32le_k = 10, + sz_encoding_utf32be_k = 11, }; // Character Set Detection is one of the most commonly performed operations in data processing with @@ -6354,78 +1015,18 @@ SZ_PUBLIC sz_bool_t sz_detect_encoding(sz_cptr_t text, sz_size_t length) { #pragma clang attribute pop #pragma GCC pop_options -#endif - -#pragma endregion +#endif // SZ_USE_ICE +#pragma endregion // Ice Lake Implementation -/* @brief Implementation of the string search algorithms using the Arm NEON instruction set, available on 64-bit - * Arm processors. Implements: {substring search, character search, character set search} x {forward, reverse}. +/* Implementation of the string search algorithms using the Arm NEON instruction set, available on 64-bit + * Arm processors. Covers billions of mobile CPUs worldwide, including Apple's A-series, and Qualcomm's Snapdragon. */ -#pragma region ARM NEON - -#if SZ_USE_ARM_NEON +#pragma region NEON Implementation +#if SZ_USE_NEON #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+simd") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) -/** - * @brief Helper structure to simplify work with 64-bit words. - */ -typedef union sz_u128_vec_t { - uint8x16_t u8x16; - uint16x8_t u16x8; - uint32x4_t u32x4; - uint64x2_t u64x2; - sz_u64_t u64s[2]; - sz_u32_t u32s[4]; - sz_u16_t u16s[8]; - sz_u8_t u8s[16]; -} sz_u128_vec_t; - -SZ_INTERNAL sz_u64_t _sz_vreinterpretq_u8_u4(uint8x16_t vec) { - // Use `vshrn` to produce a bitmask, similar to `movemask` in SSE. - // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon - return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(vec), 4)), 0) & 0x8888888888888888ull; -} - -SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: - //! 
https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations - return sz_order_serial(a, a_length, b, b_length); -} - -SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_u128_vec_t a_vec, b_vec; - for (; length >= 16; a += 16, b += 16, length -= 16) { - a_vec.u8x16 = vld1q_u8((sz_u8_t const *)a); - b_vec.u8x16 = vld1q_u8((sz_u8_t const *)b); - uint8x16_t cmp = vceqq_u8(a_vec.u8x16, b_vec.u8x16); - if (vminvq_u8(cmp) != 255) { return sz_false_k; } // Check if all bytes match - } - - // Handle remaining bytes - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; -} - -SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) { - uint64x2_t sum_vec = vdupq_n_u64(0); - - // Process 16 bytes (128 bits) at a time - for (; length >= 16; text += 16, length -= 16) { - uint8x16_t vec = vld1q_u8((sz_u8_t const *)text); // Load 16 bytes - uint16x8_t pairwise_sum1 = vpaddlq_u8(vec); // Pairwise add lower and upper 8 bits - uint32x4_t pairwise_sum2 = vpaddlq_u16(pairwise_sum1); // Pairwise add 16-bit results - uint64x2_t pairwise_sum3 = vpaddlq_u32(pairwise_sum2); // Pairwise add 32-bit results - sum_vec = vaddq_u64(sum_vec, pairwise_sum3); // Accumulate the sum - } - - // Final reduction of `sum_vec` to a single scalar - sz_u64_t sum = vgetq_lane_u64(sum_vec, 0) + vgetq_lane_u64(sum_vec, 1); - if (length) sum += sz_checksum_serial(text, length); - return sum; -} - SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { // In most cases the `source` and the `target` are not aligned, but we should // at least make sure that writes don't touch many cache lines. @@ -6524,8 +1125,9 @@ SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_ lookup_64_to_127_vec.u8x16 = vqtbl4q_u8(lut_64_to_127_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x40))); lookup_128_to_191_vec.u8x16 = vqtbl4q_u8(lut_128_to_191_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x80))); lookup_192_to_255_vec.u8x16 = vqtbl4q_u8(lut_192_to_255_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0xc0))); - blended_0_to_255_vec.u8x16 = vorrq_u8(vorrq_u8(lookup_0_to_63_vec.u8x16, lookup_64_to_127_vec.u8x16), - vorrq_u8(lookup_128_to_191_vec.u8x16, lookup_192_to_255_vec.u8x16)); + blended_0_to_255_vec.u8x16 = vorrq_u8( // + vorrq_u8(lookup_0_to_63_vec.u8x16, lookup_64_to_127_vec.u8x16), + vorrq_u8(lookup_128_to_191_vec.u8x16, lookup_192_to_255_vec.u8x16)); vst1q_u8((sz_u8_t *)target, blended_0_to_255_vec.u8x16); } @@ -6533,232 +1135,16 @@ SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_ for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = lut[*(sz_u8_t const *)source]; } -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - sz_u64_t matches; - sz_u128_vec_t h_vec, n_vec, matches_vec; - n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n); - - while (h_length >= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h); - matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16); - // In Arm NEON we don't have a `movemask` to combine it with `ctz` and get the offset of the match. - // But assuming the `vmaxvq` is cheap, we can use it to find the first match, by blending (bitwise selecting) - // the vector with a relative offsets array. 
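// `_sz_vreinterpretq_u8_u4` (defined above) narrows every 0x00/0xFF comparison byte to a nibble
// with `vshrn`, so `matches` carries 4 bits per haystack byte, of which only the top bit of each
// nibble survives the masking. That is why byte offsets are recovered as `sz_u64_ctz(matches) / 4`
// here, and as `sz_u64_clz(matches) / 4` in the reverse-order searches further down.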
- matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - - h += 16, h_length -= 16; - } - - return sz_find_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - sz_u64_t matches; - sz_u128_vec_t h_vec, n_vec, matches_vec; - n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n); - - while (h_length >= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h + h_length - 16); - matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; - h_length -= 16; - } - - return sz_rfind_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_u64_t _sz_find_charset_neon_register(sz_u128_vec_t h_vec, uint8x16_t set_top_vec_u8x16, - uint8x16_t set_bottom_vec_u8x16) { - - // Once we've read the characters in the haystack, we want to - // compare them against our bitset. The serial version of that code - // would look like: `(set_->_u8s[c >> 3] & (1u << (c & 7u))) != 0`. - uint8x16_t byte_index_vec = vshrq_n_u8(h_vec.u8x16, 3); - uint8x16_t byte_mask_vec = vshlq_u8(vdupq_n_u8(1), vreinterpretq_s8_u8(vandq_u8(h_vec.u8x16, vdupq_n_u8(7)))); - uint8x16_t matches_top_vec = vqtbl1q_u8(set_top_vec_u8x16, byte_index_vec); - // The table lookup instruction in NEON replies to out-of-bound requests with zeros. - // The values in `byte_index_vec` all fall in [0; 32). So for values under 16, substracting 16 will underflow - // and map into interval [240, 256). Meaning that those will be populated with zeros and we can safely - // merge `matches_top_vec` and `matches_bottom_vec` with a bitwise OR. - uint8x16_t matches_bottom_vec = vqtbl1q_u8(set_bottom_vec_u8x16, vsubq_u8(byte_index_vec, vdupq_n_u8(16))); - uint8x16_t matches_vec = vorrq_u8(matches_top_vec, matches_bottom_vec); - // Istead of pure `vandq_u8`, we can immediately broadcast a match presence across each 8-bit word. - matches_vec = vtstq_u8(matches_vec, byte_mask_vec); - return _sz_vreinterpretq_u8_u4(matches_vec); -} - -SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_neon(h, h_length, n); - - // Scan through the string. - // Assuming how tiny the Arm NEON registers are, we should avoid internal branches at all costs. - // That's why, for smaller needles, we use different loops. - if (n_length == 2) { - // Broadcast needle characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_last_vec, n_first_vec, n_last_vec, matches_vec; - // Dealing with 16-bit values, we can load 2 registers at a time and compare 31 possible offsets - // in a single loop iteration. - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]); - for (; h_length >= 17; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1)); - matches_vec.u8x16 = - vandq_u8(vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - } - else if (n_length == 3) { - // Broadcast needle characters into SIMD registers. 
- sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - // Comparing 24-bit values is a bumer. Being lazy, I went with the same approach - // as when searching for string over 4 characters long. I only avoid the last comparison. - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[2]); - for (; h_length >= 18; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 2)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - } - else { - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - // Broadcast those characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]); - // Walk through the string. - for (; h_length >= n_length + 16; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_first)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_mid)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_last)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - while (matches) { - int potential_offset = sz_u64_ctz(matches) / 4; - if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - } - - return sz_find_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_neon(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Will contain 4 bits per character. 
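// The reverse search mirrors `sz_find_neon` above: compare only the three "anomaly" bytes picked
// by `_sz_locate_needle_anomalies`, and treat positions where all three agree as candidates to be
// confirmed with a full comparison. Schematically, the scalar filter for a candidate offset `i` is:
//
//     if (h[i + offset_first] == n[offset_first] && h[i + offset_mid] == n[offset_mid] &&
//         h[i + offset_last] == n[offset_last] && sz_equal(h + i, n, n_length))
//         return h + i;
//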
- sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]); - - sz_cptr_t h_reversed; - for (; h_length >= n_length + 16; h_length -= 16) { - h_reversed = h + h_length - n_length - 16 + 1; - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_first)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_mid)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_last)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - while (matches) { - int potential_offset = sz_u64_clz(matches) / 4; - if (sz_equal(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & (1ull << (63 - potential_offset * 4))) != 0 && - "The bit must be set before we squash it"); - matches &= ~(1ull << (63 - potential_offset * 4)); - } - } - - return sz_rfind_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { - sz_u64_t matches; - sz_u128_vec_t h_vec; - uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); - uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); - - for (; h_length >= 16; h += 16, h_length -= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h)); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - - return sz_find_charset_serial(h, h_length, set); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { - sz_u64_t matches; - sz_u128_vec_t h_vec; - uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); - uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); - - // Check `sz_find_charset_neon` for explanations. - for (; h_length >= 16; h_length -= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h) + h_length - 16); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); - if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; - } - - return sz_rfind_charset_serial(h, h_length, set); -} - #pragma clang attribute pop #pragma GCC pop_options -#endif // Arm Neon - -#pragma endregion +#endif // SZ_USE_NEON +#pragma endregion // NEON Implementation -/* @brief Implementation of the string search algorithms using the Arm SVE variable-length registers, available - * in Arm v9 processors. - * - * Implements: - * - memory: {copy, move, fill} - * - comparisons: {equal, order} - * - search: {substring, character, character set} x {forward, reverse}. +/* Implementation of the memory operations using the Arm SVE variable-length registers, + * available in Arm v9 processors, like in Apple M4+ and Graviton 3+ CPUs. 
*/ -#pragma region ARM SVE - -#if SZ_USE_ARM_SVE +#pragma region SVE Implementation +#if SZ_USE_SVE #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+sve") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) @@ -6867,82 +1253,23 @@ SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length) #pragma clang attribute pop #pragma GCC pop_options -#endif // Arm SVE - -#pragma endregion +#endif // SZ_USE_SVE +#pragma endregion // SVE Implementation -/* - * @brief Pick the right implementation for the string search algorithms. +/* Pick the right implementation for the string search algorithms. + * To override this behavior and precompile all backends - set `SZ_DYNAMIC_DISPATCH` to 1. */ #pragma region Compile Time Dispatching - -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t ins, sz_size_t length) { return sz_hash_serial(ins, length); } -SZ_PUBLIC void sz_tolower(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_tolower_serial(ins, length, outs); } -SZ_PUBLIC void sz_toupper(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toupper_serial(ins, length, outs); } -SZ_PUBLIC void sz_toascii(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toascii_serial(ins, length, outs); } -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t ins, sz_size_t length) { return sz_isascii_serial(ins, length); } - -SZ_PUBLIC void sz_hashes_fingerprint(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_ptr_t fingerprint, - sz_size_t fingerprint_bytes) { - - sz_bool_t fingerprint_length_is_power_of_two = (sz_bool_t)((fingerprint_bytes & (fingerprint_bytes - 1)) == 0); - sz_string_view_t fingerprint_buffer = {fingerprint, fingerprint_bytes}; - - // There are several issues related to the fingerprinting algorithm. - // First, the memory traversal order is important. - // https://blog.stuffedcow.net/2015/08/pagewalk-coherence/ - - // In most cases the fingerprint length will be a power of two. 
- if (fingerprint_length_is_power_of_two == sz_false_k) - sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_non_pow2_callback, &fingerprint_buffer); - else - sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_pow2_callback, &fingerprint_buffer); -} - #if !SZ_DYNAMIC_DISPATCH -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { -#if SZ_USE_X86_AVX512 - return sz_checksum_avx512(text, length); -#elif SZ_USE_X86_AVX2 - return sz_checksum_avx2(text, length); -#elif SZ_USE_ARM_NEON - return sz_checksum_neon(text, length); -#else - return sz_checksum_serial(text, length); -#endif -} - -SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { -#if SZ_USE_X86_AVX512 - return sz_equal_avx512(a, b, length); -#elif SZ_USE_X86_AVX2 - return sz_equal_avx2(a, b, length); -#elif SZ_USE_ARM_NEON - return sz_equal_neon(a, b, length); -#else - return sz_equal_serial(a, b, length); -#endif -} - -SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { -#if SZ_USE_X86_AVX512 - return sz_order_avx512(a, a_length, b, b_length); -#elif SZ_USE_X86_AVX2 - return sz_order_avx2(a, a_length, b, b_length); -#elif SZ_USE_ARM_NEON - return sz_order_neon(a, a_length, b, b_length); -#else - return sz_order_serial(a, a_length, b, b_length); -#endif -} +#pragma region Core Funcitonality SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE sz_copy_avx512(target, source, length); -#elif SZ_USE_X86_AVX2 +#elif SZ_USE_HASWELL sz_copy_avx2(target, source, length); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_NEON sz_copy_neon(target, source, length); #else sz_copy_serial(target, source, length); @@ -6950,11 +1277,11 @@ SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { } SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE sz_move_avx512(target, source, length); -#elif SZ_USE_X86_AVX2 +#elif SZ_USE_HASWELL sz_move_avx2(target, source, length); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_NEON sz_move_neon(target, source, length); #else sz_move_serial(target, source, length); @@ -6962,11 +1289,11 @@ SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { } SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) { -#if SZ_USE_X86_AVX512 - sz_fill_avx512(target, length, value); -#elif SZ_USE_X86_AVX2 - sz_fill_avx2(target, length, value); -#elif SZ_USE_ARM_NEON +#if SZ_USE_ICE + sz_fill_skylake(target, length, value); +#elif SZ_USE_HASWELL + sz_fill_haswell(target, length, value); +#elif SZ_USE_NEON sz_fill_neon(target, length, value); #else sz_fill_serial(target, length, value); @@ -6974,183 +1301,21 @@ SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) { } SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { -#if SZ_USE_X86_AVX512 - sz_look_up_transform_avx512(source, length, lut, target); -#elif SZ_USE_X86_AVX2 - sz_look_up_transform_avx2(source, length, lut, target); -#elif SZ_USE_ARM_NEON +#if SZ_USE_ICE + sz_look_up_transform_ice(source, length, lut, target); +#elif SZ_USE_HASWELL + sz_look_up_transform_haswell(source, length, lut, target); +#elif SZ_USE_NEON sz_look_up_transform_neon(source, length, lut, target); #else sz_look_up_transform_serial(source, length, lut, target); #endif } -SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, 
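/* With `SZ_DYNAMIC_DISPATCH` disabled, the preprocessor picks one backend per
 * public symbol at build time, so a caller never names a backend explicitly.
 * An illustrative sketch of a call site; the function name is a placeholder: */
void zero_then_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
    sz_fill(target, length, 0);      /* resolves to e.g. sz_fill_haswell on AVX2-only builds */
    sz_copy(target, source, length); /* same single-step resolution for sz_copy */
}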
sz_cptr_t needle) { -#if SZ_USE_X86_AVX512 - return sz_find_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_X86_AVX2 - return sz_find_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_ARM_NEON - return sz_find_byte_neon(haystack, h_length, needle); -#else - return sz_find_byte_serial(haystack, h_length, needle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_X86_AVX512 - return sz_rfind_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_X86_AVX2 - return sz_rfind_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_ARM_NEON - return sz_rfind_byte_neon(haystack, h_length, needle); -#else - return sz_rfind_byte_serial(haystack, h_length, needle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_X86_AVX512 - return sz_find_avx512(haystack, h_length, needle, n_length); -#elif SZ_USE_X86_AVX2 - return sz_find_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_ARM_NEON - return sz_find_neon(haystack, h_length, needle, n_length); -#else - return sz_find_serial(haystack, h_length, needle, n_length); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_X86_AVX512 - return sz_rfind_avx512(haystack, h_length, needle, n_length); -#elif SZ_USE_X86_AVX2 - return sz_rfind_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_ARM_NEON - return sz_rfind_neon(haystack, h_length, needle, n_length); -#else - return sz_rfind_serial(haystack, h_length, needle, n_length); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_X86_AVX512 - return sz_find_charset_avx512(text, length, set); -#elif SZ_USE_X86_AVX2 - return sz_find_charset_avx2(text, length, set); -#elif SZ_USE_ARM_NEON - return sz_find_charset_neon(text, length, set); -#else - return sz_find_charset_serial(text, length, set); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_X86_AVX512 - return sz_rfind_charset_avx512(text, length, set); -#elif SZ_USE_X86_AVX2 - return sz_rfind_charset_avx2(text, length, set); -#elif SZ_USE_ARM_NEON - return sz_rfind_charset_neon(text, length, set); -#else - return sz_rfind_charset_serial(text, length, set); -#endif -} - -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_serial(a, a_length, b, b_length, bound); -} - -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_utf8_serial(a, a_length, b, b_length, bound); -} - -SZ_DYNAMIC sz_size_t sz_edit_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { -#if SZ_USE_X86_AVX512 - return sz_edit_distance_avx512(a, a_length, b, b_length, bound, alloc); -#else - return sz_edit_distance_serial(a, a_length, b, b_length, bound, alloc); -#endif -} - -SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - return _sz_edit_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc); -} - -SZ_DYNAMIC sz_ssize_t 
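/* All of the search dispatchers follow the same shape; from the caller's side
 * the usage is uniform. A hedged sketch, with the needle chosen arbitrarily: */
sz_cptr_t find_comma_separated_key(sz_cptr_t haystack, sz_size_t haystack_length) {
    sz_cptr_t needle = "key=";
    sz_cptr_t match = sz_find(haystack, haystack_length, needle, 4);
    return match; /* a null pointer signals that the needle is absent */
}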
sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_error_cost_t const *subs, sz_error_cost_t gap, - sz_memory_allocator_t *alloc) { -#if SZ_USE_X86_AVX512 - return sz_alignment_score_avx512(a, a_length, b, b_length, subs, gap, alloc); -#else - return sz_alignment_score_serial(a, a_length, b, b_length, subs, gap, alloc); -#endif -} - -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle) { -#if SZ_USE_X86_AVX512 - sz_hashes_avx512(text, length, window_length, window_step, callback, callback_handle); -#elif SZ_USE_X86_AVX2 - sz_hashes_avx2(text, length, window_length, window_step, callback, callback_handle); -#else - sz_hashes_serial(text, length, window_length, window_step, callback, callback_handle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_rfind_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_rfind_charset(h, h_length, &set); -} - -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { - sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data); -} - -#endif -#pragma endregion +#endif // !SZ_DYNAMIC_DISPATCH +#pragma endregion // Compile Time Dispatching #ifdef __cplusplus -#pragma GCC diagnostic pop } #endif // __cplusplus - -#endif // STRINGZILLA_H_ +#endif // STRINGZILLA_MEMORY_H_ From 8b401bd41e4bd9c29c8fad9a5b83d8232efa50c7 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 15:36:38 +0000 Subject: [PATCH 033/751] Fix: Filter `similarity.h` file --- include/stringzilla/similarity.h | 6890 +++--------------------------- 1 file changed, 554 insertions(+), 6336 deletions(-) diff --git a/include/stringzilla/similarity.h b/include/stringzilla/similarity.h index de7fbcac..e811fefe 100644 --- a/include/stringzilla/similarity.h +++ b/include/stringzilla/similarity.h @@ -1,5140 +1,607 @@ /** - * @brief StringZilla is a collection of advanced string algorithms, designed to be used in Big Data applications. - * It is generally faster than LibC, and has a broader & cleaner interface, and targets modern x86 CPUs - * with AVX-512 and Arm NEON and older CPUs with SWAR and auto-vectorization. - * - * Consider overriding the following macros to customize the library: - * - * - `SZ_DEBUG=0` - whether to enable debug assertions and logging. 
- * - `SZ_DYNAMIC_DISPATCH=0` - whether to use runtime dispatching of the most advanced SIMD backend. - * - `SZ_USE_MISALIGNED_LOADS=0` - whether to use misaligned loads on platforms that support them. - * - `SZ_SWAR_THRESHOLD=24` - threshold for switching to SWAR backend over serial byte-level for-loops. - * - `SZ_USE_X86_AVX512=?` - whether to use AVX-512 instructions on x86_64. - * - `SZ_USE_X86_AVX2=?` - whether to use AVX2 instructions on x86_64. - * - `SZ_USE_ARM_NEON=?` - whether to use NEON instructions on ARM. - * - `SZ_USE_ARM_SVE=?` - whether to use SVE instructions on ARM. - * - * @see StringZilla: https://github.com/ashvardanian/StringZilla/blob/main/README.md - * @see LibC String: https://pubs.opengroup.org/onlinepubs/009695399/basedefs/string.h.html - * - * @file stringzilla.h + * @brief Hardware-accelerated string similarity utilities. + * @file similarity.h * @author Ash Vardanian - */ -#ifndef STRINGZILLA_H_ -#define STRINGZILLA_H_ - -#define STRINGZILLA_VERSION_MAJOR 3 -#define STRINGZILLA_VERSION_MINOR 11 -#define STRINGZILLA_VERSION_PATCH 0 - -/** - * @brief When set to 1, the library will include the following LibC headers: and . - * In debug builds (SZ_DEBUG=1), the library will also include and . * - * You may want to disable this compiling for use in the kernel, or in embedded systems. - * You may also avoid them, if you are very sensitive to compilation time and avoid pre-compiled headers. - * https://artificial-mind.net/projects/compile-health/ - */ -#ifndef SZ_AVOID_LIBC -#define SZ_AVOID_LIBC (0) // true or false -#endif - -/** - * @brief A misaligned load can be - trying to fetch eight consecutive bytes from an address - * that is not divisible by eight. On x86 enabled by default. On ARM it's not. - * - * Most platforms support it, but there is no industry standard way to check for those. - * This value will mostly affect the performance of the serial (SWAR) backend. - */ -#ifndef SZ_USE_MISALIGNED_LOADS -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) -#define SZ_USE_MISALIGNED_LOADS (1) // true or false -#else -#define SZ_USE_MISALIGNED_LOADS (0) // true or false -#endif -#endif - -/** - * @brief Removes compile-time dispatching, and replaces it with runtime dispatching. - * So the `sz_find` function will invoke the most advanced backend supported by the CPU, - * that runs the program, rather than the most advanced backend supported by the CPU - * used to compile the library or the downstream application. - */ -#ifndef SZ_DYNAMIC_DISPATCH -#define SZ_DYNAMIC_DISPATCH (0) // true or false -#endif - -/** - * @brief Analogous to `size_t` and `std::size_t`, unsigned integer, identical to pointer size. - * 64-bit on most platforms where pointers are 64-bit. - * 32-bit on platforms where pointers are 32-bit. - */ -#if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64) -#define SZ_DETECT_64_BIT (1) -#define SZ_SIZE_MAX (0xFFFFFFFFFFFFFFFFull) // Largest unsigned integer that fits into 64 bits. -#define SZ_SSIZE_MAX (0x7FFFFFFFFFFFFFFFull) // Largest signed integer that fits into 64 bits. -#else -#define SZ_DETECT_64_BIT (0) -#define SZ_SIZE_MAX (0xFFFFFFFFu) // Largest unsigned integer that fits into 32 bits. -#define SZ_SSIZE_MAX (0x7FFFFFFFu) // Largest signed integer that fits into 32 bits. -#endif - -/** - * @brief On Big-Endian machines StringZilla will work in compatibility mode. 
- * This disables SWAR hacks to minimize code duplication, assuming practically - * all modern popular platforms are Little-Endian. + * Includes core APIs: * - * This variable is hard to infer from macros reliably. It's best to set it manually. - * For that CMake provides the `TestBigEndian` and `CMAKE__BYTE_ORDER` (from 3.20 onwards). - * In Python one can check `sys.byteorder == 'big'` in the `setup.py` script and pass the appropriate macro. - * https://stackoverflow.com/a/27054190 - */ -#ifndef SZ_DETECT_BIG_ENDIAN -#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \ - defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || defined(__MIBSEB) || defined(__MIBSEB__) -#define SZ_DETECT_BIG_ENDIAN (1) //< It's a big-endian target architecture -#else -#define SZ_DETECT_BIG_ENDIAN (0) //< It's a little-endian target architecture -#endif -#endif - -/* - * Debugging and testing. - */ -#ifndef SZ_DEBUG -#if defined(DEBUG) || defined(_DEBUG) // This means "Not using DEBUG information". -#define SZ_DEBUG (1) -#else -#define SZ_DEBUG (0) -#endif -#endif - -/** - * @brief Threshold for switching to SWAR (8-bytes at a time) backend over serial byte-level for-loops. - * On very short strings, under 16 bytes long, at most a single word will be processed with SWAR. - * Assuming potentially misaligned loads, SWAR makes sense only after ~24 bytes. - */ -#ifndef SZ_SWAR_THRESHOLD -#if SZ_DEBUG -#define SZ_SWAR_THRESHOLD (8u) // 8 bytes in debug builds -#else -#define SZ_SWAR_THRESHOLD (24u) // 24 bytes in release builds -#endif -#endif - -/* Annotation for the public API symbols: + * - `sz_edit_distance` & `sz_edit_distance_utf8` for Levenshtein edit-distance computation. + * - `sz_alignment_score` for weighted Needleman-Wunsch global alignment. + * - `sz_hamming_distance` & `sz_hamming_distance_utf8` for Hamming distance computation. * - * - `SZ_PUBLIC` is used for functions that are part of the public API. - * - `SZ_INTERNAL` is used for internal helper functions with unstable APIs. - * - `SZ_DYNAMIC` is used for functions that are part of the public API, but are dispatched at runtime. + * The Hamming distance is rarely used in string processing, so only minimal compatibility is provided. + * The Levenshtein distance, however, is much more popular and computationally intensive. + * So a huge part of this file is focused on optimizing it for different input alphabet sizes and input lengths. */ -#ifndef SZ_DYNAMIC -#if SZ_DYNAMIC_DISPATCH -#if defined(_WIN32) || defined(__CYGWIN__) -#define SZ_DYNAMIC __declspec(dllexport) -#define SZ_EXTERNAL __declspec(dllimport) -#define SZ_PUBLIC inline static -#define SZ_INTERNAL inline static -#else -#define SZ_DYNAMIC __attribute__((visibility("default"))) -#define SZ_EXTERNAL extern -#define SZ_PUBLIC __attribute__((unused)) inline static -#define SZ_INTERNAL __attribute__((always_inline)) inline static -#endif // _WIN32 || __CYGWIN__ -#else -#define SZ_DYNAMIC inline static -#define SZ_EXTERNAL extern -#define SZ_PUBLIC inline static -#define SZ_INTERNAL inline static -#endif // SZ_DYNAMIC_DISPATCH -#endif // SZ_DYNAMIC +#ifndef STRINGZILLA_SIMILARITY_H_ +#define STRINGZILLA_SIMILARITY_H_ -/** - * @brief Alignment macro for 64-byte alignment. 
- */ -#if defined(_MSC_VER) -#define SZ_ALIGN64 __declspec(align(64)) -#elif defined(__GNUC__) || defined(__clang__) -#define SZ_ALIGN64 __attribute__((aligned(64))) -#else -#define SZ_ALIGN64 -#endif +#include "types.h" #ifdef __cplusplus extern "C" { #endif -/* - * Let's infer the integer types or pull them from LibC, - * if that is allowed by the user. - */ -#if !SZ_AVOID_LIBC -#include // `size_t` -#include // `uint8_t` -typedef int8_t sz_i8_t; // Always 8 bits -typedef uint8_t sz_u8_t; // Always 8 bits -typedef uint16_t sz_u16_t; // Always 16 bits -typedef int32_t sz_i32_t; // Always 32 bits -typedef uint32_t sz_u32_t; // Always 32 bits -typedef uint64_t sz_u64_t; // Always 64 bits -typedef int64_t sz_i64_t; // Always 64 bits -typedef size_t sz_size_t; // Pointer-sized unsigned integer, 32 or 64 bits -typedef ptrdiff_t sz_ssize_t; // Signed version of `sz_size_t`, 32 or 64 bits - -#else // if SZ_AVOID_LIBC: - -// ! The C standard doesn't specify the signedness of char. -// ! On x86 char is signed by default while on Arm it is unsigned by default. -// ! That's why we don't define `sz_char_t` and generally use explicit `sz_i8_t` and `sz_u8_t`. -typedef signed char sz_i8_t; // Always 8 bits -typedef unsigned char sz_u8_t; // Always 8 bits -typedef unsigned short sz_u16_t; // Always 16 bits -typedef int sz_i32_t; // Always 32 bits -typedef unsigned int sz_u32_t; // Always 32 bits -typedef long long sz_i64_t; // Always 64 bits -typedef unsigned long long sz_u64_t; // Always 64 bits - -// Now we need to redefine the `size_t`. -// Microsoft Visual C++ (MSVC) typically follows LLP64 data model on 64-bit platforms, -// where integers, pointers, and long types have different sizes: -// -// > `int` is 32 bits -// > `long` is 32 bits -// > `long long` is 64 bits -// > pointer (thus, `size_t`) is 64 bits -// -// In contrast, GCC and Clang on 64-bit Unix-like systems typically follow the LP64 model, where: -// -// > `int` is 32 bits -// > `long` and pointer (thus, `size_t`) are 64 bits -// > `long long` is also 64 bits -// -// Source: https://learn.microsoft.com/en-us/windows/win32/winprog64/abstract-data-models -#if SZ_DETECT_64_BIT -typedef unsigned long long sz_size_t; // 64-bit. -typedef long long sz_ssize_t; // 64-bit. -#else -typedef unsigned sz_size_t; // 32-bit. -typedef unsigned sz_ssize_t; // 32-bit. -#endif // SZ_DETECT_64_BIT - -#endif // SZ_AVOID_LIBC - -/** - * @brief Compile-time assert macro similar to `static_assert` in C++. - */ -#define sz_static_assert(condition, name) \ - typedef struct { \ - int static_assert_##name : (condition) ? 1 : -1; \ - } sz_static_assert_##name##_t - -sz_static_assert(sizeof(sz_size_t) == sizeof(void *), sz_size_t_must_be_pointer_size); -sz_static_assert(sizeof(sz_ssize_t) == sizeof(void *), sz_ssize_t_must_be_pointer_size); - -#pragma region Public API - -typedef char *sz_ptr_t; // A type alias for `char *` -typedef char const *sz_cptr_t; // A type alias for `char const *` -typedef sz_i8_t sz_error_cost_t; // Character mismatch cost for fuzzy matching functions - -typedef sz_u64_t sz_sorted_idx_t; // Index of a sorted string in a list of strings - -typedef enum { sz_false_k = 0, sz_true_k = 1 } sz_bool_t; // Only one relevant bit -typedef enum { sz_less_k = -1, sz_equal_k = 0, sz_greater_k = 1 } sz_ordering_t; // Only three possible states: <=> - -/** - * @brief Tiny string-view structure. It's POD type, unlike the `std::string_view`. 
- */ -typedef struct sz_string_view_t { - sz_cptr_t start; - sz_size_t length; -} sz_string_view_t; - -/** - * @brief Enumeration of SIMD capabilities of the target architecture. - * Used to introspect the supported functionality of the dynamic library. - */ -typedef enum sz_capability_t { - sz_cap_serial_k = 1, /// Serial (non-SIMD) capability - sz_cap_any_k = 0x7FFFFFFF, /// Mask representing any capability - - sz_cap_arm_neon_k = 1 << 10, /// ARM NEON capability - sz_cap_arm_sve_k = 1 << 11, /// ARM SVE capability TODO: Not yet supported or used - sz_cap_arm_sve2_k = 1 << 12, - sz_cap_arm_sve2p1_k = 1 << 13, - sz_cap_x86_avx2_k = 1 << 20, /// x86 AVX2 capability - sz_cap_x86_avx512f_k = 1 << 21, /// x86 AVX512 F capability - sz_cap_x86_avx512bw_k = 1 << 22, /// x86 AVX512 BW instruction capability - sz_cap_x86_avx512vl_k = 1 << 23, /// x86 AVX512 VL instruction capability - sz_cap_x86_avx512vbmi_k = 1 << 24, /// x86 AVX512 VBMI instruction capability - sz_cap_x86_gfni_k = 1 << 25, /// x86 AVX512 GFNI instruction capability - -} sz_capability_t; - -/** - * @brief Function to determine the SIMD capabilities of the current machine @b only at @b runtime. - * @return A bitmask of the SIMD capabilities represented as a `sz_capability_t` enum value. - */ -SZ_DYNAMIC sz_capability_t sz_capabilities(void); - -/** - * @brief Bit-set structure for 256 possible byte values. Useful for filtering and search. - * @see sz_charset_init, sz_charset_add, sz_charset_contains, sz_charset_invert - */ -typedef union sz_charset_t { - sz_u64_t _u64s[4]; - sz_u32_t _u32s[8]; - sz_u16_t _u16s[16]; - sz_u8_t _u8s[32]; -} sz_charset_t; - -/** @brief Initializes a bit-set to an empty collection, meaning - all characters are banned. */ -SZ_PUBLIC void sz_charset_init(sz_charset_t *s) { s->_u64s[0] = s->_u64s[1] = s->_u64s[2] = s->_u64s[3] = 0; } - -/** @brief Adds a character to the set and accepts @b unsigned integers. */ -SZ_PUBLIC void sz_charset_add_u8(sz_charset_t *s, sz_u8_t c) { s->_u64s[c >> 6] |= (1ull << (c & 63u)); } - -/** @brief Adds a character to the set. Consider @b sz_charset_add_u8. */ -SZ_PUBLIC void sz_charset_add(sz_charset_t *s, char c) { sz_charset_add_u8(s, *(sz_u8_t *)(&c)); } // bitcast - -/** @brief Checks if the set contains a given character and accepts @b unsigned integers. */ -SZ_PUBLIC sz_bool_t sz_charset_contains_u8(sz_charset_t const *s, sz_u8_t c) { - // Checking the bit can be done in different ways: - // - (s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0 - // - (s->_u32s[c >> 5] & (1u << (c & 31u))) != 0 - // - (s->_u16s[c >> 4] & (1u << (c & 15u))) != 0 - // - (s->_u8s[c >> 3] & (1u << (c & 7u))) != 0 - return (sz_bool_t)((s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0); -} - -/** @brief Checks if the set contains a given character. Consider @b sz_charset_contains_u8. */ -SZ_PUBLIC sz_bool_t sz_charset_contains(sz_charset_t const *s, char c) { - return sz_charset_contains_u8(s, *(sz_u8_t *)(&c)); // bitcast -} - -/** @brief Inverts the contents of the set, so allowed character get disallowed, and vice versa. */ -SZ_PUBLIC void sz_charset_invert(sz_charset_t *s) { - s->_u64s[0] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[1] ^= 0xFFFFFFFFFFFFFFFFull, // - s->_u64s[2] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[3] ^= 0xFFFFFFFFFFFFFFFFull; -} - -typedef void *(*sz_memory_allocate_t)(sz_size_t, void *); -typedef void (*sz_memory_free_t)(void *, sz_size_t, void *); -typedef sz_u64_t (*sz_random_generator_t)(void *); - -/** - * @brief Some complex pattern matching algorithms may require memory allocations. 
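/* The bit-set helpers above give O(1) membership tests over all 256 byte values.
 * A hedged usage sketch, with the delimiter set chosen purely for illustration: */
sz_cptr_t find_first_delimiter(sz_cptr_t text, sz_size_t length) {
    sz_charset_t delimiters;
    sz_charset_init(&delimiters);
    sz_charset_add(&delimiters, ',');
    sz_charset_add(&delimiters, ';');
    sz_charset_add(&delimiters, '\n');
    return sz_find_charset(text, length, &delimiters); /* null pointer if none present */
}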
- * This structure is used to pass the memory allocator to those functions. - * @see sz_memory_allocator_init_fixed - */ -typedef struct sz_memory_allocator_t { - sz_memory_allocate_t allocate; - sz_memory_free_t free; - void *handle; -} sz_memory_allocator_t; - -/** - * @brief Initializes a memory allocator to use the system default `malloc` and `free`. - * ! The function is not available if the library was compiled with `SZ_AVOID_LIBC`. - * - * @param alloc Memory allocator to initialize. - */ -SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc); +#pragma region Core API /** - * @brief Initializes a memory allocator to use a static-capacity buffer. - * No dynamic allocations will be performed. + * @brief Computes the Hamming distance between two strings - number of not matching characters. + * Difference in length is is counted as a mismatch. * - * @param alloc Memory allocator to initialize. - * @param buffer Buffer to use for allocations. - * @param length Length of the buffer. @b Must be greater than 8 bytes. Different values would be optimal for - * different algorithms and input lengths, but 4096 bytes (one RAM page) is a good default. - */ -SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length); - -/** - * @brief The number of bytes a stack-allocated string can hold, including the SZ_NULL termination character. - * ! This can't be changed from outside. Don't use the `#error` as it may already be included and set. - */ -#ifdef SZ_STRING_INTERNAL_SPACE -#undef SZ_STRING_INTERNAL_SPACE -#endif -#define SZ_STRING_INTERNAL_SPACE (sizeof(sz_size_t) * 3 - 1) // 3 pointers minus one byte for an 8-bit length - -/** - * @brief Tiny memory-owning string structure with a Small String Optimization (SSO). - * Differs in layout from Folly, Clang, GCC, and probably most other implementations. - * It's designed to avoid any branches on read-only operations, and can store up - * to 22 characters on stack on 64-bit machines, followed by the SZ_NULL-termination character. + * @param a First string to compare. + * @param a_length Number of bytes in the first string. + * @param b Second string to compare. + * @param b_length Number of bytes in the second string. * - * @section Changing Length + * @param bound Exclusive upper bound on the distance, that allows us to exit early. + * Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore. + * Pass zero to check if the strings are equal. + * @return Returns an unsigned integer for the edit distance. Zero means the strings are equal. + * Returns the `(max(a_length, b_length)) + 1` if the distance limit was reached. * - * One nice thing about this design, is that you can, in many cases, change the length of the string - * without any branches, invoking a `+=` or `-=` on the 64-bit `length` field. If the string is on heap, - * the solution is obvious. If it's on stack, inplace decrement wouldn't affect the top bytes of the string, - * only changing the last byte containing the length. 
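/* The allocator callbacks above are what the similarity functions below thread
 * their temporary DP rows through. A hedged sketch of pairing a fixed stack
 * arena with `sz_edit_distance`; the 4096-byte size follows the "one RAM page"
 * recommendation and is otherwise arbitrary, and the function name is a placeholder: */
sz_size_t edit_distance_without_malloc(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) {
    char arena[4096];
    sz_memory_allocator_t alloc;
    sz_memory_allocator_init_fixed(&alloc, arena, sizeof(arena));
    return sz_edit_distance(a, a_length, b, b_length, SZ_SIZE_MAX, &alloc);
}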
+ * @see sz_hamming_distance_utf8 + * @see https://en.wikipedia.org/wiki/Hamming_distance */ -typedef union sz_string_t { - -#if !SZ_DETECT_BIG_ENDIAN - - struct external { - sz_ptr_t start; - sz_size_t length; - sz_size_t space; - sz_size_t padding; - } external; - - struct internal { - sz_ptr_t start; - sz_u8_t length; - char chars[SZ_STRING_INTERNAL_SPACE]; - } internal; - -#else - - struct external { - sz_ptr_t start; - sz_size_t space; - sz_size_t padding; - sz_size_t length; - } external; - - struct internal { - sz_ptr_t start; - char chars[SZ_STRING_INTERNAL_SPACE]; - sz_u8_t length; - } internal; - -#endif - - sz_size_t words[4]; - -} sz_string_t; - -typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t); -typedef sz_u64_t (*sz_checksum_t)(sz_cptr_t, sz_size_t); -typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t); -typedef sz_ordering_t (*sz_order_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -typedef void (*sz_to_converter_t)(sz_cptr_t, sz_size_t, sz_ptr_t); +SZ_DYNAMIC sz_size_t sz_hamming_distance( // + sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); /** - * @brief Computes the 64-bit check-sum of bytes in a string. - * Similar to `std::ranges::accumulate`. + * @brief Computes the Hamming distance between two @b UTF8 strings - number of not matching characters. + * Difference in length is is counted as a mismatch. * - * @param text String to aggregate. - * @param length Number of bytes in the text. - * @return 64-bit unsigned value. - */ -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length); - -/** @copydoc sz_checksum */ -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length); - -/** - * @brief Computes the 64-bit unsigned hash of a string. Fairly fast for short strings, - * simple implementation, and supports rolling computation, reused in other APIs. - * Similar to `std::hash` in C++. + * @param a First string to compare. + * @param a_length Number of bytes in the first string. + * @param b Second string to compare. + * @param b_length Number of bytes in the second string. * - * @param text String to hash. - * @param length Number of bytes in the text. - * @return 64-bit hash value. + * @param bound Exclusive upper bound on the distance, that allows us to exit early. + * Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore. + * Pass zero to check if the strings are equal. + * @return Returns an unsigned integer for the edit distance. Zero means the strings are equal. + * Returns the `(max(a_length, b_length)) + 1` if the distance limit was reached. * - * @see sz_hashes, sz_hashes_fingerprint, sz_hashes_intersection + * @see sz_hamming_distance + * @see https://en.wikipedia.org/wiki/Hamming_distance */ -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length); - -/** @copydoc sz_hash */ -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length); +SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8( // + sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); /** - * @brief Checks if two string are equal. - * Similar to `memcmp(a, b, length) == 0` in LibC and `a == b` in STL. - * - * The implementation of this function is very similar to `sz_order`, but the usage patterns are different. - * This function is more often used in parsing, while `sz_order` is often used in sorting. - * It works best on platforms with cheap + * @brief Computes the Levenshtein edit-distance between two strings using the Wagner-Fisher algorithm. 
+ * Similar to the Needleman-Wunsch alignment algorithm. Often used in fuzzy string matching. * * @param a First string to compare. + * @param a_length Number of bytes in the first string. * @param b Second string to compare. - * @param length Number of bytes in both strings. - * @return 1 if strings match, 0 otherwise. + * @param b_length Number of bytes in the second string. + * + * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, + * so the memory usage is linear in relation to ::a_length and ::b_length. + * If SZ_NULL is passed, will initialize to the systems default `malloc`. + * + * @param bound Exclusive upper bound on the distance, that allows us to exit early. + * Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore. + * Pass zero to check if the strings are equal. + * @return Returns an unsigned integer for the edit distance. Zero means the strings are equal. + * Returns the `(max(a_length, b_length)) + 1` if the distance limit was reached. + * Returns `SZ_SIZE_MAX` if the memory allocation failed. + * + * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default + * @see https://en.wikipedia.org/wiki/Levenshtein_distance */ -SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length); - -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length); +SZ_DYNAMIC sz_size_t sz_edit_distance( // + sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound, sz_memory_allocator_t *alloc); /** - * @brief Estimates the relative order of two strings. Equivalent to `memcmp(a, b, length)` in LibC. - * Can be used on different length strings. + * @brief Computes the Levenshtein edit-distance between two @b UTF8 strings. + * Unlike `sz_edit_distance`, reports the distance in Unicode codepoints, and not in bytes. * * @param a First string to compare. * @param a_length Number of bytes in the first string. * @param b Second string to compare. * @param b_length Number of bytes in the second string. - * @return Negative if (a < b), positive if (a > b), zero if they are equal. - */ -SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); - -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); - -/** - * @brief Look Up Table @b (LUT) transformation of a string. Equivalent to `for (char & c : text) c = lut[c]`. - * - * Can be used to implement some form of string normalization, partially masking punctuation marks, - * or converting between different character sets, like uppercase or lowercase. Surprisingly, also has - * broad implications in image processing, where image channel transformations are often done using LUTs. * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param lut Look Up Table to apply. Must be exactly @b 256 bytes long. - * @param result Output string, can point to the same address as ::text. - */ -SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); - -typedef void (*sz_look_up_transform_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t); - -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = tolower(c)`. 
+ * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, + * so the memory usage is linear in relation to ::a_length and ::b_length. + * If SZ_NULL is passed, will initialize to the systems default `malloc`. * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html + * @param bound Exclusive upper bound on the distance, that allows us to exit early. + * Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore. + * Pass zero to check if the strings are equal. + * @return Returns an unsigned integer for the edit distance. Zero means the strings are equal. + * Returns the `(max(a_length, b_length)) + 1` if the distance limit was reached. + * Returns `SZ_SIZE_MAX` if the memory allocation failed. * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. + * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default, sz_edit_distance + * @see https://en.wikipedia.org/wiki/Levenshtein_distance */ -SZ_PUBLIC void sz_tolower(sz_cptr_t text, sz_size_t length, sz_ptr_t result); +SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( // + sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound, sz_memory_allocator_t *alloc); /** - * @brief Equivalent to `for (char & c : text) c = toupper(c)`. - * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html + * @brief Computes Needleman–Wunsch alignment score for two string. Often used in bioinformatics and cheminformatics. + * Similar to the Levenshtein edit-distance, parameterized for gap and substitution penalties. * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_toupper(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = toascii(c)`. + * Not commutative in the general case, as the order of the strings matters, as `sz_alignment_score(a, b)` may + * not be equal to `sz_alignment_score(b, a)`. Becomes @b commutative, if the substitution costs are symmetric. + * Equivalent to the negative Levenshtein distance, if: `gap == -1` and `subs[i][j] == (i == j ? 0: -1)`. * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Checks if all characters in the range are valid ASCII characters. + * @param a First string to compare. + * @param a_length Number of bytes in the first string. + * @param b Second string to compare. + * @param b_length Number of bytes in the second string. 
+ * @param gap Penalty cost for gaps - insertions and removals. + * @param subs Substitution costs matrix with 256 x 256 values for all pairs of characters. * - * @param text String to be analyzed. - * @param length Number of bytes in the string. - * @return Whether all characters are valid ASCII characters. - */ -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t text, sz_size_t length); - -/** - * @brief Generates a random string for a given alphabet, avoiding integer division and modulo operations. - * Similar to `text[i] = alphabet[rand() % cardinality]`. + * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, + * so the memory usage is linear in relation to ::a_length and ::b_length. + * If SZ_NULL is passed, will initialize to the systems default `malloc`. * - * The modulo operation is expensive, and should be avoided in performance-critical code. - * We avoid it using small lookup tables and replacing it with a multiplication and shifts, similar to `libdivide`. - * Alternative algorithms would include: - * - Montgomery form: https://en.algorithmica.org/hpc/number-theory/montgomery/ - * - Barret reduction: https://www.nayuki.io/page/barrett-reduction-algorithm - * - Lemire's trick: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ + * @return Signed similarity score. Can be negative, depending on the substitution costs. + * Returns `SZ_SSIZE_MAX` if the memory allocation failed. * - * @param alphabet Set of characters to sample from. - * @param cardinality Number of characters to sample from. - * @param text Output string, can point to the same address as ::text. - * @param generate Callback producing random numbers given the generator state. - * @param generator Generator state, can be a pointer to a seed, or a pointer to a random number generator. + * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default + * @see https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm */ -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); +SZ_DYNAMIC sz_ssize_t sz_alignment_score( // + sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // + sz_error_cost_t const *subs, sz_error_cost_t gap, // + sz_memory_allocator_t *alloc); -/** @copydoc sz_generate */ -SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); +/** @copydoc sz_hamming_distance */ +SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // + sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); -/** - * @brief Similar to `memcpy`, copies contents of one string into another. - * The behavior is undefined if the strings overlap. - * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. 
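/* A hedged usage sketch for the scoring APIs declared here: `SZ_NULL` as the
 * allocator falls back to the default malloc-based one, and `SZ_SIZE_MAX`
 * disables the early-exit bound. The substitution-costs pointer and the
 * function name are placeholders, not part of the library: */
void score_pair(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length,
                sz_error_cost_t const *substitution_costs, sz_error_cost_t gap_cost) {
    sz_size_t distance = sz_edit_distance(a, a_length, b, b_length, SZ_SIZE_MAX, SZ_NULL);
    sz_ssize_t score = sz_alignment_score(a, a_length, b, b_length, substitution_costs, gap_cost, SZ_NULL);
    sz_unused(distance && score);
}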
- */ -SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length); +/** @copydoc sz_hamming_distance_utf8 */ +SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( // + sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); +/** @copydoc sz_edit_distance */ +SZ_PUBLIC sz_size_t sz_edit_distance_serial( // + sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound, sz_memory_allocator_t *alloc); -/** - * @brief Similar to `memmove`, copies (moves) contents of one string into another. - * Unlike `sz_copy`, allows overlapping strings as arguments. - * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. - */ -SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length); +/** @copydoc sz_edit_distance_utf8 */ +SZ_PUBLIC sz_size_t sz_edit_distance_utf8_serial( // + sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound, sz_memory_allocator_t *alloc); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); +/** @copydoc sz_alignment_score */ +SZ_PUBLIC sz_ssize_t sz_alignment_score_serial( // + sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // + sz_error_cost_t const *subs, sz_error_cost_t gap, // + sz_memory_allocator_t *alloc); -typedef void (*sz_move_t)(sz_ptr_t, sz_cptr_t, sz_size_t); +#pragma endregion // Core API -/** - * @brief Similar to `memset`, fills a string with a given value. - * - * @param target String to fill. - * @param length Number of bytes to fill. - * @param value Value to fill with. - */ -SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value); +#pragma region Serial Implementation -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value); +SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // + sz_size_t bound, sz_memory_allocator_t *alloc) { -typedef void (*sz_fill_t)(sz_ptr_t, sz_size_t, sz_u8_t); + // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. + sz_memory_allocator_t global_alloc; + if (!alloc) { + sz_memory_allocator_init_default(&global_alloc); + alloc = &global_alloc; + } -/** - * @brief Initializes a string class instance to an empty value. - */ -SZ_PUBLIC void sz_string_init(sz_string_t *string); + // TODO: Generalize to remove the following asserts! + sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix."); + sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet."); + sz_unused(longer_length && bound); -/** - * @brief Convenience function checking if the provided string is stored inside of the ::string instance itself, - * alternative being - allocated in a remote region of the heap. - */ -SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string); + // We are going to store 3 diagonals of the matrix. + // The length of the longest (main) diagonal would be `n = (shorter_length + 1)`. 
+ sz_size_t n = shorter_length + 1; + sz_size_t buffer_length = sizeof(sz_size_t) * n * 3; + sz_size_t *distances = (sz_size_t *)alloc->allocate(buffer_length, alloc->handle); + if (!distances) return SZ_SIZE_MAX; -/** - * @brief Unpacks the opaque instance of a string class into its components. - * Recommended to use only in read-only operations. - * - * @param string String to unpack. - * @param start Pointer to the start of the string. - * @param length Number of bytes in the string, before the SZ_NULL character. - * @param space Number of bytes allocated for the string (heap or stack), including the SZ_NULL character. - * @param is_external Whether the string is allocated on the heap externally, or fits withing ::string instance. - */ -SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, - sz_bool_t *is_external); + sz_size_t *previous_distances = distances; + sz_size_t *current_distances = previous_distances + n; + sz_size_t *next_distances = previous_distances + n * 2; -/** - * @brief Unpacks only the start and length of the string. - * Recommended to use only in read-only operations. - * - * @param string String to unpack. - * @param start Pointer to the start of the string. - * @param length Number of bytes in the string, before the SZ_NULL character. - */ -SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length); + // Initialize the first two diagonals: + previous_distances[0] = 0; + current_distances[0] = current_distances[1] = 1; -/** - * @brief Constructs a string of a given ::length with noisy contents. - * Use the returned character pointer to populate the string. - * - * @param string String to initialize. - * @param length Number of bytes in the string, before the SZ_NULL character. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator); - -/** - * @brief Doesn't change the contents or the length of the string, but grows the available memory capacity. - * This is beneficial, if several insertions are expected, and we want to minimize allocations. - * - * @param string String to grow. - * @param new_capacity The number of characters to reserve space for, including existing ones. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the new start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator); - -/** - * @brief Grows the string by adding an uninitialized region of ::added_length at the given ::offset. - * Would often be used in conjunction with one or more `sz_copy` calls to populate the allocated region. - * Similar to `sz_string_reserve`, but changes the length of the ::string. - * - * @param string String to grow. - * @param offset Offset of the first byte to reserve space for. - * If provided offset is larger than the length, it will be capped. - * @param added_length The number of new characters to reserve space for. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the new start of the string otherwise. 
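/* For reference, the skewed-diagonal kernel above computes the same quantity as
 * the textbook Wagner-Fischer recurrence. A minimal two-row sketch with unit
 * costs and caller-provided scratch rows (each `b_length + 1` elements), shown
 * only to make the recurrence explicit, not as the library's implementation: */
static sz_size_t levenshtein_two_rows(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length,
                                      sz_size_t *previous, sz_size_t *current) {
    for (sz_size_t j = 0; j <= b_length; ++j) previous[j] = j;
    for (sz_size_t i = 1; i <= a_length; ++i) {
        current[0] = i;
        for (sz_size_t j = 1; j <= b_length; ++j) {
            sz_size_t substitution = previous[j - 1] + (a[i - 1] != b[j - 1]);
            sz_size_t insertion = current[j - 1] + 1;
            sz_size_t deletion = previous[j] + 1;
            sz_size_t best = substitution < insertion ? substitution : insertion;
            current[j] = best < deletion ? best : deletion;
        }
        sz_size_t *swap = previous;
        previous = current;
        current = swap;
    }
    return previous[b_length]; /* after the final swap the last row lives in `previous` */
}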
- */ -SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length, - sz_memory_allocator_t *allocator); - -/** - * @brief Removes a range from a string. Changes the length, but not the capacity. - * Performs no allocations or deallocations and can't fail. - * - * @param string String to clean. - * @param offset Offset of the first byte to remove. - * @param length Number of bytes to remove. Out-of-bound ranges will be capped. - * @return Number of bytes removed. - */ -SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length); - -/** - * @brief Shrinks the string to fit the current length, if it's allocated on the heap. - * It's the reverse operation of ::sz_string_reserve. - * - * @param string String to shrink. - * @param allocator Memory allocator to use for the allocation. - * @return Whether the operation was successful. The only failures can come from the allocator. - * On failure, the string will remain unchanged. - */ -SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator); - -/** - * @brief Frees the string, if it's allocated on the heap. - * If the string is on the stack, the function clears/resets the state. - */ -SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator); - -#pragma endregion - -#pragma region Fast Substring Search API - -typedef sz_cptr_t (*sz_find_byte_t)(sz_cptr_t, sz_size_t, sz_cptr_t); -typedef sz_cptr_t (*sz_find_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_charset_t const *); - -/** - * @brief Locates first matching byte in a string. Equivalent to `memchr(haystack, *needle, h_length)` in LibC. - * - * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memchr.S - * Aarch64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/aarch64/memchr.S - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the first match. - */ -SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** - * @brief Locates last matching byte in a string. Equivalent to `memrchr(haystack, *needle, h_length)` in LibC. - * - * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memrchr.S - * Aarch64 implementation: missing - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the last match. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** - * @brief Locates first matching substring. - * Equivalent to `memmem(haystack, h_length, needle, n_length)` in LibC. - * Similar to `strstr(haystack, needle)` in LibC, but requires known length. - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the first match. 
- */ -SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** - * @brief Locates the last matching substring. - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the last match. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** - * @brief Finds the first character present from the ::set, present in ::text. - * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_rfind_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". - * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the first matching character from ::set. - */ -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** - * @brief Finds the last character present from the ::set, present in ::text. - * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_find_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". - * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the last matching character from ::set. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -#pragma endregion - -#pragma region String Similarity Measures API - -/** - * @brief Computes the Hamming distance between two strings - number of not matching characters. - * Difference in length is is counted as a mismatch. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for the distance, the `bound` if was exceeded. 
- * - * @see sz_hamming_distance_utf8 - * @see https://en.wikipedia.org/wiki/Hamming_distance - */ -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); - -/** @copydoc sz_hamming_distance */ -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); - -/** - * @brief Computes the Hamming distance between two @b UTF8 strings - number of not matching characters. - * Difference in length is is counted as a mismatch. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for the distance, the `bound` if was exceeded. - * - * @see sz_hamming_distance - * @see https://en.wikipedia.org/wiki/Hamming_distance - */ -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); - -/** @copydoc sz_hamming_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); - -typedef sz_size_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t); - -/** - * @brief Computes the Levenshtein edit-distance between two strings using the Wagner-Fisher algorithm. - * Similar to the Needleman-Wunsch alignment algorithm. Often used in fuzzy string matching. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Exclusive upper bound on the distance, that allows us to exit early. - * Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore. - * Pass zero to check if the strings are equal. - * @return Unsigned integer for the edit distance. Zero means the strings are equal. - * Returns the `bound` if it was exceeded or `SZ_SIZE_MAX` if the memory allocation failed. - * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default - * @see https://en.wikipedia.org/wiki/Levenshtein_distance - */ -SZ_DYNAMIC sz_size_t sz_edit_distance(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** @copydoc sz_edit_distance */ -SZ_PUBLIC sz_size_t sz_edit_distance_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** - * @brief Computes the Levenshtein edit-distance between two @b UTF8 strings. - * Unlike `sz_edit_distance`, reports the distance in Unicode codepoints, and not in bytes. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param alloc Temporary memory allocator. 
Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for edit distance, the `bound` if was exceeded or `SZ_SIZE_MAX` - * if the memory allocation failed. - * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default, sz_edit_distance - * @see https://en.wikipedia.org/wiki/Levenshtein_distance - */ -SZ_DYNAMIC sz_size_t sz_edit_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -typedef sz_size_t (*sz_edit_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_memory_allocator_t *); - -/** @copydoc sz_edit_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_edit_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** - * @brief Computes Needleman–Wunsch alignment score for two string. Often used in bioinformatics and cheminformatics. - * Similar to the Levenshtein edit-distance, parameterized for gap and substitution penalties. - * - * Not commutative in the general case, as the order of the strings matters, as `sz_alignment_score(a, b)` may - * not be equal to `sz_alignment_score(b, a)`. Becomes @b commutative, if the substitution costs are symmetric. - * Equivalent to the negative Levenshtein distance, if: `gap == -1` and `subs[i][j] == (i == j ? 0: -1)`. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * @param gap Penalty cost for gaps - insertions and removals. - * @param subs Substitution costs matrix with 256 x 256 values for all pairs of characters. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @return Signed similarity score. Can be negative, depending on the substitution costs. - * If the memory allocation fails, the function returns `SZ_SSIZE_MAX`. - * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default - * @see https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm - */ -SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); - -/** @copydoc sz_alignment_score */ -SZ_PUBLIC sz_ssize_t sz_alignment_score_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); - -typedef sz_ssize_t (*sz_alignment_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *, - sz_error_cost_t, sz_memory_allocator_t *); - -typedef void (*sz_hash_callback_t)(sz_cptr_t, sz_size_t, sz_u64_t, void *user); - -/** - * @brief Computes the Karp-Rabin rolling hashes of a string supplying them to the provided `callback`. - * Can be used for similarity scores, search, ranking, etc. 
- *
- * Rabin-Karp-like rolling hashes can have a very high level of collisions, depending
- * on the choice of bases and the prime number. That's why, often, two hashes from the same
- * family are used with different bases.
- *
- * 1. Kernighan and Ritchie's function uses 31, a prime close to the size of the English alphabet.
- * 2. To be friendlier to byte-arrays and UTF8, we use 257 for the second function.
- *
- * Choosing the right ::window_length is task- and domain-dependent. For example, most English words are
- * between 3 and 7 characters long, so a window of 4 bytes would be a good choice. For DNA sequences,
- * the ::window_length might be a multiple of 3, as each codon is 3 bytes (nucleotides) long.
- * With such minimalistic alphabets of just four characters (AGCT), longer windows might be needed.
- * For protein sequences the alphabet is 20 characters long, so the window can be shorter than for DNA.
- *
- * @param text String to hash.
- * @param length Number of bytes in the string.
- * @param window_length Length of the rolling window in bytes.
- * @param window_step Step of reported hashes. @b Must be a power of two. Should be smaller than `window_length`.
- * @param callback Function receiving the start & length of a substring, the hash, and the `callback_handle`.
- * @param callback_handle Optional user-provided pointer to be passed to the `callback`.
- * @see sz_hashes_fingerprint, sz_hashes_intersection
- */
-SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, //
-                          sz_hash_callback_t callback, void *callback_handle);
-
-/** @copydoc sz_hashes */
-SZ_PUBLIC void sz_hashes_serial(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, //
-                                sz_hash_callback_t callback, void *callback_handle);
-
-typedef void (*sz_hashes_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_size_t, sz_hash_callback_t, void *);
-
-/**
- * @brief Computes the Karp-Rabin rolling hashes of a string, outputting a binary fingerprint.
- * Such fingerprints can be compared with Hamming or Jaccard (Tanimoto) distance for similarity.
- *
- * The algorithm doesn't clear the fingerprint buffer on start, so it can be invoked multiple times
- * to produce a fingerprint of a longer string, by passing the previous fingerprint as the ::fingerprint.
- * It can also be reused to produce multi-resolution fingerprints by changing the ::window_length
- * and calling the same function multiple times for the same input ::text.
- *
- * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer,
- * avoiding cache-coherency penalties of remote on-heap buffers.
- *
- * @param text String to hash.
- * @param length Number of bytes in the string.
- * @param fingerprint Output fingerprint buffer.
- * @param fingerprint_bytes Number of bytes in the fingerprint buffer.
- * @param window_length Length of the rolling window in bytes.
- * @see sz_hashes, sz_hashes_intersection
- */
-SZ_PUBLIC void sz_hashes_fingerprint( //
-    sz_cptr_t text, sz_size_t length, sz_size_t window_length, //
-    sz_ptr_t fingerprint, sz_size_t fingerprint_bytes);
-
-typedef void (*sz_hashes_fingerprint_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_ptr_t, sz_size_t);
-
-/**
- * @brief Given a hash-fingerprint of a textual document, computes the number of intersecting hashes
- * of the incoming document. Can be used for document scoring and search.
- * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text Input document. - * @param length Number of bytes in the input document. - * @param fingerprint Reference document fingerprint. - * @param fingerprint_bytes Number of bytes in the reference documents fingerprint. - * @param window_length Length of the rolling window in bytes. - * @see sz_hashes, sz_hashes_fingerprint - */ -SZ_PUBLIC sz_size_t sz_hashes_intersection( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_cptr_t fingerprint, sz_size_t fingerprint_bytes); - -typedef sz_size_t (*sz_hashes_intersection_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_cptr_t, sz_size_t); - -#pragma endregion - -#pragma region Convenience API - -/** - * @brief Finds the first character in the haystack, that is present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the first character in the haystack, that is @b not present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the last character in the haystack, that is present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the last character in the haystack, that is @b not present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -#pragma endregion - -#pragma region String Sequences API - -struct sz_sequence_t; - -typedef sz_cptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t); -typedef sz_bool_t (*sz_string_is_less_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); - -typedef struct sz_sequence_t { - sz_sorted_idx_t *order; - sz_size_t count; - sz_sequence_member_start_t get_start; - sz_sequence_member_length_t get_length; - void const *handle; -} sz_sequence_t; - -/** - * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. - * Expects ::offsets to contains `count + 1` entries, the last pointing at the end - * of the last string, indicating the total length of the ::tape. - */ -SZ_PUBLIC void sz_sequence_from_u32tape(sz_cptr_t *start, sz_u32_t const *offsets, sz_size_t count, - sz_sequence_t *sequence); - -/** - * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. - * Expects ::offsets to contains `count + 1` entries, the last pointing at the end - * of the last string, indicating the total length of the ::tape. 
- */
-SZ_PUBLIC void sz_sequence_from_u64tape(sz_cptr_t *start, sz_u64_t const *offsets, sz_size_t count,
-                                        sz_sequence_t *sequence);
-
-/**
- * @brief Similar to `std::partition`, given a predicate, splits the sequence into two parts.
- * The algorithm is unstable, meaning that elements may change relative order, as long
- * as they are in the right partition. This is the simpler algorithm for partitioning.
- */
-SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate);
-
-/**
- * @brief In-place `std::set_union` for two consecutive chunks forming the same continuous `sequence`.
- *
- * @param partition The number of elements in the first sub-sequence in `sequence`.
- * @param less Comparison function, to determine the lexicographic ordering.
- */
-SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less);
-
-/**
- * @brief Sorting algorithm, combining Radix Sort for the first 32 bits of every word,
- * followed by a more conventional sorting procedure on equally prefixed parts.
- */
-SZ_PUBLIC void sz_sort(sz_sequence_t *sequence);
-
-/**
- * @brief Partial sorting algorithm, combining Radix Sort for the first 32 bits of every word,
- * followed by a more conventional sorting procedure on equally prefixed parts.
- */
-SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t n);
-
-/**
- * @brief Intro-Sort algorithm that supports custom comparators.
- */
-SZ_PUBLIC void sz_sort_intro(sz_sequence_t *sequence, sz_sequence_comparator_t less);
-
-#pragma endregion
-
-/*
- * Hardware feature detection.
- * All of those can be controlled by the user.
- */
-#ifndef SZ_USE_X86_AVX512
-#ifdef __AVX512BW__
-#define SZ_USE_X86_AVX512 1
-#else
-#define SZ_USE_X86_AVX512 0
-#endif
-#endif
-
-#ifndef SZ_USE_X86_AVX2
-#ifdef __AVX2__
-#define SZ_USE_X86_AVX2 1
-#else
-#define SZ_USE_X86_AVX2 0
-#endif
-#endif
-
-#ifndef SZ_USE_ARM_NEON
-#ifdef __ARM_NEON
-#define SZ_USE_ARM_NEON 1
-#else
-#define SZ_USE_ARM_NEON 0
-#endif
-#endif
-
-#ifndef SZ_USE_ARM_SVE
-#ifdef __ARM_FEATURE_SVE
-#define SZ_USE_ARM_SVE 1
-#else
-#define SZ_USE_ARM_SVE 0
-#endif
-#endif
-
-/*
- * Include hardware-specific headers.
- */
-#if SZ_USE_X86_AVX512 || SZ_USE_X86_AVX2
-#include <immintrin.h>
-#endif // SZ_USE_X86...
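Since the sequence interface above is callback-driven, a short usage sketch may help. The following standalone example is not part of the patch: it assumes the single-header `<stringzilla/stringzilla.h>` include path and that the caller pre-fills `order` with the identity permutation before `sz_sort` reorders it.

#include <stdio.h>
#include <string.h>
#include <stringzilla/stringzilla.h>

/* Callbacks exposing a plain array of NUL-terminated C strings to the sequence API. */
static sz_cptr_t demo_get_start(sz_sequence_t const *sequence, sz_size_t i) {
    return ((char const *const *)sequence->handle)[i];
}
static sz_size_t demo_get_length(sz_sequence_t const *sequence, sz_size_t i) {
    return strlen(((char const *const *)sequence->handle)[i]);
}

int main(void) {
    char const *words[] = {"banana", "apple", "cherry", "apricot"};
    sz_sorted_idx_t order[4] = {0, 1, 2, 3}; // identity permutation, assumed to be caller-initialized
    sz_sequence_t sequence;
    sequence.order = order;
    sequence.count = 4;
    sequence.get_start = demo_get_start;
    sequence.get_length = demo_get_length;
    sequence.handle = words;
    sz_sort(&sequence); // reorders `order`; the `words` array itself is left untouched
    for (sz_size_t i = 0; i != sequence.count; ++i) printf("%s\n", words[order[i]]);
    return 0;
}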
-#if SZ_USE_ARM_NEON -#if !defined(_MSC_VER) -#include -#endif -#include -#endif // SZ_USE_ARM_NEON -#if SZ_USE_ARM_SVE -#if !defined(_MSC_VER) -#include -#endif -#endif // SZ_USE_ARM_SVE - -#pragma region Hardware Specific API - -#if SZ_USE_X86_AVX512 - -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_edit_distance */ -SZ_PUBLIC sz_size_t sz_edit_distance_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); -/** @copydoc sz_alignment_score */ -SZ_PUBLIC sz_ssize_t sz_alignment_score_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle); -#endif - -#if SZ_USE_X86_AVX2 -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t 
n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle); -#endif - -#if SZ_USE_ARM_NEON -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -#endif - -#if SZ_USE_ARM_SVE -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_sve(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_sve(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_sve(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_sve(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_sve(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -#endif - -#pragma endregion - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wconversion" - -/* - ********************************************************************************************************************** - 
********************************************************************************************************************** - ********************************************************************************************************************** - * - * This is where we the actual implementation begins. - * The rest of the file is hidden from the public API. - * - ********************************************************************************************************************** - ********************************************************************************************************************** - ********************************************************************************************************************** - */ - -#pragma region Compiler Extensions and Helper Functions - -#pragma GCC visibility push(hidden) - -/** - * @brief Helper-macro to mark potentially unused variables. - */ -#define sz_unused(x) ((void)(x)) - -/** - * @brief Helper-macro casting a variable to another type of the same size. - */ -#define sz_bitcast(type, value) (*((type *)&(value))) - -/** - * @brief Defines `SZ_NULL`, analogous to `NULL`. - * The default often comes from locale.h, stddef.h, - * stdio.h, stdlib.h, string.h, time.h, or wchar.h. - */ -#ifdef __GNUG__ -#define SZ_NULL __null -#define SZ_NULL_CHAR __null -#else -#define SZ_NULL ((void *)0) -#define SZ_NULL_CHAR ((char *)0) -#endif - -/** - * @brief Cache-line width, that will affect the execution of some algorithms, - * like equality checks and relative order computing. - */ -#define SZ_CACHE_LINE_WIDTH (64) // bytes - -/** - * @brief Similar to `assert`, the `sz_assert` is used in the SZ_DEBUG mode - * to check the invariants of the library. It's a no-op in the SZ_RELEASE mode. - * @note If you want to catch it, put a breakpoint at @b `__GI_exit` - */ -#if SZ_DEBUG && defined(SZ_AVOID_LIBC) && !SZ_AVOID_LIBC && !defined(SZ_PIC) -#include // `fprintf` -#include // `EXIT_FAILURE` -SZ_PUBLIC void _sz_assert_failure(char const *condition, char const *file, int line) { - fprintf(stderr, "Assertion failed: %s, in file %s, line %d\n", condition, file, line); - exit(EXIT_FAILURE); -} -#define sz_assert(condition) \ - do { \ - if (!(condition)) { _sz_assert_failure(#condition, __FILE__, __LINE__); } \ - } while (0) -#else -#define sz_assert(condition) ((void)(condition)) -#endif - -/* Intrinsics aliases for MSVC, GCC, Clang, and Clang-Cl. - * The following section of compiler intrinsics comes in 2 flavors. - */ -#if defined(_MSC_VER) && !defined(__clang__) // On Clang-CL -#include - -// Sadly, when building Win32 images, we can't use the `_tzcnt_u64`, `_lzcnt_u64`, -// `_BitScanForward64`, or `_BitScanReverse64` intrinsics. For now it's a simple `for`-loop. -// TODO: In the future we can switch to a more efficient De Bruijn's algorithm. -// https://www.chessprogramming.org/BitScan -// https://www.chessprogramming.org/De_Bruijn_Sequence -// https://gist.github.com/resilar/e722d4600dbec9752771ab4c9d47044f -// -// Use the serial version on 32-bit x86 and on Arm. 
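The TODO above points at De Bruijn sequences as a branch-free alternative to the `for`-loop fallbacks guarded below. Purely as an illustration of that technique (hypothetical helper names, the widely used 0x03f79d71b4cb0a89 multiplier, and not part of this patch), a trailing-zero count could be sketched as:

#include <stdint.h>

/* Sketch of a De Bruijn-multiply bit-scan: the table maps each 6-bit window of the
 * constant back to the shift that produced it, so isolating the lowest set bit and
 * multiplying lands that bit's index in the top 6 bits. Undefined for `x == 0`,
 * matching the `_tzcnt_u64`-style intrinsics it would replace. */
static int debruijn_ctz_table[64];

static void debruijn_ctz_init(void) {
    uint64_t const debruijn = 0x03f79d71b4cb0a89ull;
    for (int bit = 0; bit != 64; ++bit) debruijn_ctz_table[(debruijn << bit) >> 58] = bit;
}

static int debruijn_ctz(uint64_t x) {
    uint64_t const debruijn = 0x03f79d71b4cb0a89ull;
    return debruijn_ctz_table[((x & (0 - x)) * debruijn) >> 58];
}

A one-time `debruijn_ctz_init()` call, or a pre-generated constant table, would be needed before the first lookup.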
-#if (defined(_WIN32) && !defined(_WIN64)) || defined(_M_ARM) || defined(_M_ARM64) -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 1) == 0) { n++, x >>= 1; } - return n; -} -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 0x8000000000000000ull) == 0) { n++, x <<= 1; } - return n; -} -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { - x = x - ((x >> 1) & 0x5555555555555555ull); - x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull); - return (((x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Full) * 0x0101010101010101ull) >> 56; -} -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 1) == 0) { n++, x >>= 1; } - return n; -} -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 0x80000000u) == 0) { n++, x <<= 1; } - return n; -} -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { - x = x - ((x >> 1) & 0x55555555); - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - return (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; -} -#else -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return (int)_tzcnt_u64(x); } -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return (int)_lzcnt_u64(x); } -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return (int)__popcnt64(x); } -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return (int)_tzcnt_u32(x); } -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return (int)_lzcnt_u32(x); } -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return (int)__popcnt(x); } -#endif -// Force the byteswap functions to be intrinsics, because when /Oi- is given, these will turn into CRT function calls, -// which breaks when `SZ_AVOID_LIBC` is given -#pragma intrinsic(_byteswap_uint64) -SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return _byteswap_uint64(val); } -#pragma intrinsic(_byteswap_ulong) -SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return _byteswap_ulong(val); } -#else -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return __builtin_popcountll(x); } -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return __builtin_popcount(x); } -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return __builtin_ctzll(x); } -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return __builtin_clzll(x); } -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return __builtin_ctz(x); } // ! Undefined if `x == 0` -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return __builtin_clz(x); } // ! Undefined if `x == 0` -SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return __builtin_bswap64(val); } -SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return __builtin_bswap32(val); } -#endif - -SZ_INTERNAL sz_u64_t sz_u64_rotl(sz_u64_t x, sz_u64_t r) { return (x << r) | (x >> (64 - r)); } - -/** - * @brief Select bits from either ::a or ::b depending on the value of ::mask bits. - * - * Similar to `_mm_blend_epi16` intrinsic on x86. - * Described in the "Bit Twiddling Hacks" by Sean Eron Anderson. - * https://graphics.stanford.edu/~seander/bithacks.html#ConditionalSetOrClearBitsWithoutBranching - */ -SZ_INTERNAL sz_u64_t sz_u64_blend(sz_u64_t a, sz_u64_t b, sz_u64_t mask) { return a ^ ((a ^ b) & mask); } - -/* - * Efficiently computing the minimum and maximum of two or three values can be tricky. - * The simple branching baseline would be: - * - * x < y ? x : y // can replace with 1 conditional move - * - * Branchless approach is well known for signed integers, but it doesn't apply to unsigned ones. 
- * https://stackoverflow.com/questions/514435/templatized-branchless-int-max-min-function - * https://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax - * Using only bit-shifts for singed integers it would be: - * - * y + ((x - y) & (x - y) >> 31) // 4 unique operations - * - * Alternatively, for any integers using multiplication: - * - * (x > y) * y + (x <= y) * x // 5 operations - * - * Alternatively, to avoid multiplication: - * - * x & ~((x < y) - 1) + y & ((x < y) - 1) // 6 unique operations - */ -#define sz_min_of_two(x, y) (x < y ? x : y) -#define sz_max_of_two(x, y) (x < y ? y : x) -#define sz_min_of_three(x, y, z) sz_min_of_two(x, sz_min_of_two(y, z)) -#define sz_max_of_three(x, y, z) sz_max_of_two(x, sz_max_of_two(y, z)) - -/** @brief Branchless minimum function for two signed 32-bit integers. */ -SZ_INTERNAL sz_i32_t sz_i32_min_of_two(sz_i32_t x, sz_i32_t y) { return y + ((x - y) & (x - y) >> 31); } - -/** @brief Branchless minimum function for two signed 32-bit integers. */ -SZ_INTERNAL sz_i32_t sz_i32_max_of_two(sz_i32_t x, sz_i32_t y) { return x - ((x - y) & (x - y) >> 31); } - -/** - * @brief Clamps signed offsets in a string to a valid range. Used for Pythonic-style slicing. - */ -SZ_INTERNAL void sz_ssize_clamp_interval(sz_size_t length, sz_ssize_t start, sz_ssize_t end, - sz_size_t *normalized_offset, sz_size_t *normalized_length) { - // TODO: Remove branches. - // Normalize negative indices - if (start < 0) start += length; - if (end < 0) end += length; - - // Clamp indices to a valid range - if (start < 0) start = 0; - if (end < 0) end = 0; - if (start > (sz_ssize_t)length) start = length; - if (end > (sz_ssize_t)length) end = length; - - // Ensure start <= end - if (start > end) start = end; - - *normalized_offset = start; - *normalized_length = end - start; -} - -/** - * @brief Compute the logarithm base 2 of a positive integer, rounding down. - */ -SZ_INTERNAL sz_size_t sz_size_log2i_nonzero(sz_size_t x) { - sz_assert(x > 0 && "Non-positive numbers have no defined logarithm"); - sz_size_t leading_zeros = sz_u64_clz(x); - return 63 - leading_zeros; -} - -/** - * @brief Compute the smallest power of two greater than or equal to ::x. - */ -SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) { - // Unlike the commonly used trick with `clz` intrinsics, is valid across the whole range of `x`. - // https://stackoverflow.com/a/10143264 - x--; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; -#if SZ_DETECT_64_BIT - x |= x >> 32; -#endif - x++; - return x; -} - -/** - * @brief Transposes an 8x8 bit matrix packed in a `sz_u64_t`. - * - * There is a well known SWAR sequence for that known to chess programmers, - * willing to flip a bit-matrix of pieces along the main A1-H8 diagonal. - * https://www.chessprogramming.org/Flipping_Mirroring_and_Rotating - * https://lukas-prokop.at/articles/2021-07-23-transpose - */ -SZ_INTERNAL sz_u64_t sz_u64_transpose(sz_u64_t x) { - sz_u64_t t; - t = x ^ (x << 36); - x ^= 0xf0f0f0f00f0f0f0full & (t ^ (x >> 36)); - t = 0xcccc0000cccc0000ull & (x ^ (x << 18)); - x ^= t ^ (t >> 18); - t = 0xaa00aa00aa00aa00ull & (x ^ (x << 9)); - x ^= t ^ (t >> 9); - return x; -} - -/** - * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. - */ -SZ_INTERNAL void sz_u64_swap(sz_u64_t *a, sz_u64_t *b) { - sz_u64_t t = *a; - *a = *b; - *b = t; -} - -/** - * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. 
- */ -SZ_INTERNAL void sz_pointer_swap(void **a, void **b) { - void *t = *a; - *a = *b; - *b = t; -} - -/** - * @brief Helper structure to simplify work with 16-bit words. - * @see sz_u16_load - */ -typedef union sz_u16_vec_t { - sz_u16_t u16; - sz_u8_t u8s[2]; -} sz_u16_vec_t; - -/** - * @brief Load a 16-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u16_vec_t sz_u16_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u16_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u16_vec_t *)ptr); -#else - return *((__unaligned sz_u16_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u16_vec_t const *result = (sz_u16_vec_t const *)ptr; - return *result; -#endif -} - -/** - * @brief Helper structure to simplify work with 32-bit words. - * @see sz_u32_load - */ -typedef union sz_u32_vec_t { - sz_u32_t u32; - sz_u16_t u16s[2]; - sz_u8_t u8s[4]; -} sz_u32_vec_t; - -/** - * @brief Load a 32-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u32_vec_t sz_u32_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u32_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - result.u8s[2] = ptr[2]; - result.u8s[3] = ptr[3]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u32_vec_t *)ptr); -#else - return *((__unaligned sz_u32_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u32_vec_t const *result = (sz_u32_vec_t const *)ptr; - return *result; -#endif -} - -/** - * @brief Helper structure to simplify work with 64-bit words. - * @see sz_u64_load - */ -typedef union sz_u64_vec_t { - sz_u64_t u64; - sz_u32_t u32s[2]; - sz_u16_t u16s[4]; - sz_u8_t u8s[8]; -} sz_u64_vec_t; - -/** - * @brief Load a 64-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u64_vec_t sz_u64_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u64_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - result.u8s[2] = ptr[2]; - result.u8s[3] = ptr[3]; - result.u8s[4] = ptr[4]; - result.u8s[5] = ptr[5]; - result.u8s[6] = ptr[6]; - result.u8s[7] = ptr[7]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u64_vec_t *)ptr); -#else - return *((__unaligned sz_u64_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u64_vec_t const *result = (sz_u64_vec_t const *)ptr; - return *result; -#endif -} - -/** @brief Helper function, using the supplied fixed-capacity buffer to allocate memory. */ -SZ_INTERNAL sz_ptr_t _sz_memory_allocate_fixed(sz_size_t length, void *handle) { - sz_size_t capacity; - sz_copy((sz_ptr_t)&capacity, (sz_cptr_t)handle, sizeof(sz_size_t)); - sz_size_t consumed_capacity = sizeof(sz_size_t); - if (consumed_capacity + length > capacity) return SZ_NULL_CHAR; - return (sz_ptr_t)handle + consumed_capacity; -} - -/** @brief Helper "no-op" function, simulating memory deallocation when we use a "static" memory buffer. 
*/ -SZ_INTERNAL void _sz_memory_free_fixed(sz_ptr_t start, sz_size_t length, void *handle) { - sz_unused(start && length && handle); -} - -/** @brief An internal callback used to set a bit in a power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) & (fingerprint_bytes - 1)] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback used to set a bit in a @b non power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_non_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) % fingerprint_bytes] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback, used to mix all the running hashes into one pointer-size value. */ -SZ_INTERNAL void _sz_hashes_fingerprint_scalar_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *scalar_handle) { - sz_unused(start && length && hash && scalar_handle); - sz_size_t *scalar_ptr = (sz_size_t *)scalar_handle; - *scalar_ptr ^= hash; -} - -/** - * @brief Chooses the offsets of the most interesting characters in a search needle. - * - * Search throughput can significantly deteriorate if we are matching the wrong characters. - * Say the needle is "aXaYa", and we are comparing the first, second, and last character. - * If we use SIMD and compare many offsets at a time, comparing against "a" in every register is a waste. - * - * Similarly, dealing with UTF8 inputs, we know that the lower bits of each character code carry more information. - * Cyrillic alphabet, for example, falls into [0x0410, 0x042F] code range for uppercase [А, Я], and - * into [0x0430, 0x044F] for lowercase [а, я]. Scanning through a text written in Russian, half of the - * bytes will carry absolutely no value and will be equal to 0x04. - */ -SZ_INTERNAL void _sz_locate_needle_anomalies(sz_cptr_t start, sz_size_t length, // - sz_size_t *first, sz_size_t *second, sz_size_t *third) { - *first = 0; - *second = length / 2; - *third = length - 1; - - // - int has_duplicates = // - start[*first] == start[*second] || // - start[*first] == start[*third] || // - start[*second] == start[*third]; - - // Loop through letters to find non-colliding variants. - if (length > 3 && has_duplicates) { - // Pivot the middle point right, until we find a character different from the first one. - for (; start[*second] == start[*first] && *second + 1 < *third; ++(*second)) {} - // Pivot the third (last) point left, until we find a different character. - for (; (start[*third] == start[*second] || start[*third] == start[*first]) && *third > (*second + 1); - --(*third)) {} - } - - // TODO: Investigate alternative strategies for long needles. - // On very long needles we have the luxury to choose! 
- // Often dealing with UTF8, we will likely benefit from shifting the first and second characters - // further to the right, to achieve not only uniqueness within the needle, but also avoid common - // rune prefixes of 2-, 3-, and 4-byte codes. - if (length > 8) { - // Pivot the first and second points right, until we find a character, that: - // > is different from others. - // > doesn't start with 0b'110x'xxxx - only 5 bits of relevant info. - // > doesn't start with 0b'1110'xxxx - only 4 bits of relevant info. - // > doesn't start with 0b'1111'0xxx - only 3 bits of relevant info. - // - // So we are practically searching for byte values that start with 0b0xxx'xxxx or 0b'10xx'xxxx. - // Meaning they fall in the range [0, 127] and [128, 191], in other words any unsigned int up to 191. - sz_u8_t const *start_u8 = (sz_u8_t const *)start; - sz_size_t vibrant_first = *first, vibrant_second = *second, vibrant_third = *third; - - // Let's begin with the seccond character, as the termination criteria there is more obvious - // and we may end up with more variants to check for the first candidate. - for (; (start_u8[vibrant_second] > 191 || start_u8[vibrant_second] == start_u8[vibrant_third]) && - (vibrant_second + 1 < vibrant_third); - ++vibrant_second) {} - - // Now check if we've indeed found a good candidate or should revert the `vibrant_second` to `second`. - if (start_u8[vibrant_second] < 191) { *second = vibrant_second; } - else { vibrant_second = *second; } - - // Now check the first character. - for (; (start_u8[vibrant_first] > 191 || start_u8[vibrant_first] == start_u8[vibrant_second] || - start_u8[vibrant_first] == start_u8[vibrant_third]) && - (vibrant_first + 1 < vibrant_second); - ++vibrant_first) {} - - // Now check if we've indeed found a good candidate or should revert the `vibrant_first` to `first`. - // We don't need to shift the third one when dealing with texts as the last byte of the text is - // also the last byte of a rune and contains the most information. - if (start_u8[vibrant_first] < 191) { *first = vibrant_first; } - } -} - -#pragma GCC visibility pop -#pragma endregion - -#pragma region Serial Implementation - -#if !SZ_AVOID_LIBC -#include // `fprintf` -#include // `malloc`, `EXIT_FAILURE` - -SZ_PUBLIC void *_sz_memory_allocate_default(sz_size_t length, void *handle) { - sz_unused(handle); - return malloc(length); -} -SZ_PUBLIC void _sz_memory_free_default(sz_ptr_t start, sz_size_t length, void *handle) { - sz_unused(handle && length); - free(start); -} - -#endif - -SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc) { -#if !SZ_AVOID_LIBC - alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_default; - alloc->free = (sz_memory_free_t)_sz_memory_free_default; -#else - alloc->allocate = (sz_memory_allocate_t)SZ_NULL; - alloc->free = (sz_memory_free_t)SZ_NULL; -#endif - alloc->handle = SZ_NULL; -} - -SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length) { - // The logic here is simple - put the buffer length in the first slots of the buffer. - // Later use it for bounds checking. - alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_fixed; - alloc->free = (sz_memory_free_t)_sz_memory_free_fixed; - alloc->handle = &buffer; - sz_copy((sz_ptr_t)buffer, (sz_cptr_t)&length, sizeof(sz_size_t)); -} - -/** - * @brief Byte-level equality comparison between two strings. - * If unaligned loads are allowed, uses a switch-table to avoid loops on short strings. 
- */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_cptr_t const a_end = a + length; -#if SZ_USE_MISALIGNED_LOADS - if (length >= SZ_SWAR_THRESHOLD) { - sz_u64_vec_t a_vec, b_vec; - for (; a + 8 <= a_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) return sz_false_k; - } - } -#endif - while (a != a_end && *a == *b) a++, b++; - return (sz_bool_t)(a_end == a); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { - for (sz_cptr_t const end = text + length; text != end; ++text) - if (sz_charset_contains(set, *text)) return text; - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Warray-bounds" - sz_cptr_t const end = text; - for (text += length; text != end;) - if (sz_charset_contains(set, *(text -= 1))) return text; - return SZ_NULL_CHAR; -#pragma GCC diagnostic pop -} - -/** - * One option to avoid branching is to use conditional moves and lookup the comparison result in a table: - * sz_ordering_t ordering_lookup[2] = {sz_greater_k, sz_less_k}; - * for (; a != min_end; ++a, ++b) - * if (*a != *b) return ordering_lookup[*a < *b]; - * That, however, introduces a data-dependency. - * A cleaner option is to perform two comparisons and a subtraction. - * One instruction more, but no data-dependency. - */ -#define _sz_order_scalars(a, b) ((sz_ordering_t)((a > b) - (a < b))) - -SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - sz_bool_t a_shorter = (sz_bool_t)(a_length < b_length); - sz_size_t min_length = a_shorter ? a_length : b_length; - sz_cptr_t min_end = a + min_length; -#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN - for (sz_u64_vec_t a_vec, b_vec; a + 8 <= min_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) - return _sz_order_scalars(sz_u64_bytes_reverse(a_vec.u64), sz_u64_bytes_reverse(b_vec.u64)); - } -#endif - for (; a != min_end; ++a, ++b) - if (*a != *b) return _sz_order_scalars(*a, *b); - - // If the strings are equal up to `min_end`, then the shorter string is smaller - return _sz_order_scalars(a_length, b_length); -} - -/** - * @brief Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each byte is set. - // For that take the bottom 7 bits of each byte, add one to them, - // and if this sets the top bit to one, then all the 7 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) & ((vec.u64 & 0x8080808080808080ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memchr(haystack, needle[0], haystack_length)`. - */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_end = h + h_length; - -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. 
-#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. - sz_u64_vec_t h_vec, n_vec, match_vec; - match_vec.u64 = 0; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h + 8 <= h_end; h += 8) { - h_vec.u64 = *(sz_u64_t const *)h; - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h + sz_u64_ctz(match_vec.u64) / 8; - } -#endif - - // Handle the misaligned tail. - for (; h < h_end; ++h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief Find the last occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memrchr(haystack, needle[0], haystack_length)`. - */ -sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_start = h; - - // Reposition the `h` pointer to the end, as we will be walking backwards. - h = h + h_length - 1; - -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. -#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)(h + 1) & 7ull) && h >= h_start; --h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. - sz_u64_vec_t h_vec, n_vec, match_vec; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h >= h_start + 7; h -= 8) { - h_vec.u64 = *(sz_u64_t const *)(h - 7); - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h - sz_u64_clz(match_vec.u64) / 8; - } -#endif - - for (; h >= h_start; --h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 2Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 2byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_2byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 2byte is set. - // For that take the bottom 15 bits of each 2byte, add one to them, - // and if this sets the top bit to one, then all the 15 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFF7FFF7FFF7FFFull) + 0x0001000100010001ull) & ((vec.u64 & 0x8000800080008000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b two-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_2byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 2 bytes long. - sz_assert(h_length >= 2 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. 
- for (; ((sz_size_t)h & 7ull) && h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; -#endif - - sz_u64_vec_t h_even_vec, h_odd_vec, n_vec, matches_even_vec, matches_odd_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1]; - n_vec.u64 *= 0x0001000100010001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. - for (; h + 9 <= h_end; h += 8) { - h_even_vec.u64 = *(sz_u64_t *)h; - h_odd_vec.u64 = (h_even_vec.u64 >> 8) | ((sz_u64_t)h[8] << 56); - matches_even_vec = _sz_u64_each_2byte_equal(h_even_vec, n_vec); - matches_odd_vec = _sz_u64_each_2byte_equal(h_odd_vec, n_vec); - - matches_even_vec.u64 >>= 8; - if (matches_even_vec.u64 + matches_odd_vec.u64) { - sz_u64_t match_indicators = matches_even_vec.u64 | matches_odd_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 4Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 4byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_4byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 4byte is set. - // For that take the bottom 31 bits of each 4byte, add one to them, - // and if this sets the top bit to one, then all the 31 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFFFFFF7FFFFFFFull) + 0x0000000100000001ull) & ((vec.u64 & 0x8000000080000000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b four-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_4byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 4 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 4 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; -#endif - - sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, n_vec, matches0_vec, matches1_vec, matches2_vec, matches3_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2], n_vec.u8s[3] = n[3]; - n_vec.u64 *= 0x0000000100000001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using four 64-bit words. - // We load the subsequent four-byte word as well, taking its first bytes. 
Think of it as a glorified prefetch :) - sz_u64_t h_page_current, h_page_next; - for (; h + sizeof(sz_u64_t) + sizeof(sz_u32_t) <= h_end; h += sizeof(sz_u64_t)) { - h_page_current = *(sz_u64_t *)h; - h_page_next = *(sz_u32_t *)(h + 8); - h0_vec.u64 = (h_page_current); - h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); - h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); - h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); - matches0_vec = _sz_u64_each_4byte_equal(h0_vec, n_vec); - matches1_vec = _sz_u64_each_4byte_equal(h1_vec, n_vec); - matches2_vec = _sz_u64_each_4byte_equal(h2_vec, n_vec); - matches3_vec = _sz_u64_each_4byte_equal(h3_vec, n_vec); - - if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64) { - matches0_vec.u64 >>= 24; - matches1_vec.u64 >>= 16; - matches2_vec.u64 >>= 8; - sz_u64_t match_indicators = matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 4 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 3Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 3byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_3byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 4byte is set. - // For that take the bottom 31 bits of each 4byte, add one to them, - // and if this sets the top bit to one, then all the 31 bits are ones as well. - vec.u64 = ((vec.u64 & 0xFFFF7FFFFF7FFFFFull) + 0x0000000001000001ull) & ((vec.u64 & 0x0000800000800000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b three-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_3byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 3 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 3 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h; -#endif - - // We fetch 12 - sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, h4_vec; - sz_u64_vec_t matches0_vec, matches1_vec, matches2_vec, matches3_vec, matches4_vec; - sz_u64_vec_t n_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2]; - n_vec.u64 *= 0x0000000001000001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using three 64-bit words. - // We load the subsequent two-byte word as well. 
- sz_u64_t h_page_current, h_page_next; - for (; h + sizeof(sz_u64_t) + sizeof(sz_u16_t) <= h_end; h += sizeof(sz_u64_t)) { - h_page_current = *(sz_u64_t *)h; - h_page_next = *(sz_u16_t *)(h + 8); - h0_vec.u64 = (h_page_current); - h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); - h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); - h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); - h4_vec.u64 = (h_page_current >> 32) | (h_page_next << 32); - matches0_vec = _sz_u64_each_3byte_equal(h0_vec, n_vec); - matches1_vec = _sz_u64_each_3byte_equal(h1_vec, n_vec); - matches2_vec = _sz_u64_each_3byte_equal(h2_vec, n_vec); - matches3_vec = _sz_u64_each_3byte_equal(h3_vec, n_vec); - matches4_vec = _sz_u64_each_3byte_equal(h4_vec, n_vec); - - if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64) { - matches0_vec.u64 >>= 16; - matches1_vec.u64 >>= 8; - matches3_vec.u64 <<= 8; - matches4_vec.u64 <<= 16; - sz_u64_t match_indicators = - matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 3 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_find_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - // Several popular string matching algorithms are using a bad-character shift table. - // Boyer Moore: https://www-igm.univ-mlv.fr/~lecroq/string/node14.html - // Quick Search: https://www-igm.univ-mlv.fr/~lecroq/string/node19.html - // Smith: https://www-igm.univ-mlv.fr/~lecroq/string/node21.html - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) bad_shift_table.jumps[n[i]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the last `n_length - 1` bytes. 
- for (sz_size_t i = 0; i <= h_length - n_length;) { - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - i += bad_shift_table.jumps[h[i + n_length - 1]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for @b reverse-order exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) - bad_shift_table.jumps[n[n_length - i - 1]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the first `n_length - 1` bytes. - for (sz_size_t j = 0; j <= h_length - n_length;) { - sz_size_t i = h_length - n_length - j; - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - j += bad_shift_table.jumps[h[i]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Exact substring search helper function, that finds the first occurrence of a prefix of the needle - * using a given search function, and then verifies the remaining part of the needle. - */ -SZ_INTERNAL sz_cptr_t _sz_find_with_prefix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_prefix, sz_size_t prefix_length) { - - sz_size_t suffix_length = n_length - prefix_length; - while (1) { - sz_cptr_t found = find_prefix(h, h_length, n, prefix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = h_length - (found - h); - if (remaining < n_length) return SZ_NULL_CHAR; - if (sz_equal(found + prefix_length, n + prefix_length, suffix_length)) return found; - - // Adjust the position. 
- h = found + 1; - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -/** - * @brief Exact reverse-order substring search helper function, that finds the last occurrence of a suffix of the - * needle using a given search function, and then verifies the remaining part of the needle. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_with_suffix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_suffix, sz_size_t suffix_length) { - - sz_size_t prefix_length = n_length - suffix_length; - while (1) { - sz_cptr_t found = find_suffix(h, h_length, n + prefix_length, suffix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = found - h; - if (remaining < prefix_length) return SZ_NULL_CHAR; - if (sz_equal(found - prefix_length, n, prefix_length)) return found - prefix_length; - - // Adjust the position. - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -SZ_INTERNAL sz_cptr_t _sz_find_over_4bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, (sz_find_t)_sz_find_4byte_serial, 4); -} - -SZ_INTERNAL sz_cptr_t _sz_find_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, - sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, _sz_find_horspool_upto_256bytes_serial, 256); -} - -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, - sz_size_t n_length) { - return _sz_rfind_with_suffix(h, h_length, n, n_length, _sz_rfind_horspool_upto_256bytes_serial, 256); -} - -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - -#if SZ_DETECT_BIG_ENDIAN - sz_find_t backends[] = { - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[(n_length > 1) + (n_length > 256)](h, h_length, n, n_length); -#else - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_2byte_serial, - (sz_find_t)_sz_find_3byte_serial, - (sz_find_t)_sz_find_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - (sz_find_t)_sz_find_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. - (n_length > 1) + (n_length > 2) + (n_length > 3) + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 4) + - // For longer needles - use skip tables. - (n_length > 8) + (n_length > 256)](h, h_length, n, n_length); -#endif -} - -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. 
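/*
 * The helpers above compose "search a short anchor, then verify the rest of the needle".
 * The same composition expressed with libc only, as a sketch: `memchr` plays the role of the
 * fast prefix searcher and `memcmp` does the verification. Illustrative, not the library's code.
 */
#include <stddef.h>
#include <string.h>

static char const *find_by_first_byte(char const *hay, size_t hay_len, char const *needle, size_t needle_len) {
    if (!needle_len || hay_len < needle_len) return NULL;
    char const *last_start = hay + hay_len - needle_len; /* last position a match can begin at */
    for (char const *h = hay; (h = memchr(h, needle[0], (size_t)(last_start - h + 1))); ++h)
        if (memcmp(h + 1, needle + 1, needle_len - 1) == 0) return h;
    return NULL;
}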
- (sz_find_t)sz_rfind_byte_serial, - // TODO: implement reverse-order SWAR for 2/3/4 byte variants. - // TODO: (sz_find_t)_sz_rfind_2byte_serial, - // TODO: (sz_find_t)_sz_rfind_3byte_serial, - // TODO: (sz_find_t)_sz_rfind_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - // (sz_find_t)_sz_rfind_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_rfind_horspool_upto_256bytes_serial, - (sz_find_t)_sz_rfind_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. - 0 + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 1) + - // For longer needles - use skip tables. - (n_length > 256)](h, h_length, n, n_length); -} - -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // TODO: Generalize to remove the following asserts! - sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix."); - sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet."); - sz_unused(longer_length && bound); - - // We are going to store 3 diagonals of the matrix. - // The length of the longest (main) diagonal would be `n = (shorter_length + 1)`. - sz_size_t n = shorter_length + 1; - sz_size_t buffer_length = sizeof(sz_size_t) * n * 3; - sz_size_t *distances = (sz_size_t *)alloc->allocate(buffer_length, alloc->handle); - if (!distances) return SZ_SIZE_MAX; - - sz_size_t *previous_distances = distances; - sz_size_t *current_distances = previous_distances + n; - sz_size_t *next_distances = previous_distances + n * 2; - - // Initialize the first two diagonals: - previous_distances[0] = 0; - current_distances[0] = current_distances[1] = 1; - - // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_diagonal_index = 2; - for (; next_diagonal_index != n; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = next_diagonal_index + 1; - for (sz_size_t i = 0; i + 2 < next_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[next_diagonal_index - i - 2] != longer[i]; - sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; - sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; - next_distances[i + 1] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); - } - // Don't forget to populate the first row and the first column of the Levenshtein matrix. - next_distances[0] = next_distances[next_diagonal_length - 1] = next_diagonal_index; - // Perform a circular rotation of those buffers, to reuse the memory. - sz_size_t *temporary = previous_distances; - previous_distances = current_distances; - current_distances = next_distances; - next_distances = temporary; - } - - // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a - // larger diagonal. From now onwards, we will be shrinking. 
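/*
 * Both dispatchers above pick a backend by summing boolean comparisons of the needle length,
 * so the selection is a single indexed call rather than a branch ladder. A sketch of the same
 * pattern with a naive stand-in kernel in every slot; all names here are illustrative.
 */
#include <stddef.h>
#include <string.h>

typedef char const *(*find_fn)(char const *, size_t, char const *, size_t);

static char const *find_naive(char const *h, size_t h_len, char const *n, size_t n_len) {
    for (size_t i = 0; i + n_len <= h_len; ++i)
        if (memcmp(h + i, n, n_len) == 0) return h + i;
    return NULL;
}

static char const *find_dispatch(char const *h, size_t h_len, char const *n, size_t n_len) {
    if (!n_len || h_len < n_len) return NULL;
    /* Real code would plug a byte-, SWAR-, or Horspool-based kernel into each slot. */
    find_fn backends[] = {find_naive, find_naive, find_naive};
    return backends[(n_len > 1) + (n_len > 8)](h, h_len, n, n_len); /* each comparison adds 0 or 1 */
}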
Instead of adding value equal to the skewed diagonal - // index on either side, we will be cropping those values out. - sz_size_t diagonals_count = n + n - 1; - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; - for (sz_size_t i = 0; i != next_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[shorter_length - 1 - i] != longer[next_diagonal_index - n + i]; - sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; - sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; - next_distances[i] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); - } - // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift, - // dropping the first element in the current array. - sz_size_t *temporary = previous_distances; - previous_distances = current_distances + 1; - current_distances = next_distances; - next_distances = temporary; - } - - // Cache scalar before `free` call. - sz_size_t result = current_distances[0]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -/** - * @brief Describes the length of a UTF8 character / codepoint / rune in bytes. - */ -typedef enum { - sz_utf8_invalid_k = 0, //!< Invalid UTF8 character. - sz_utf8_rune_1byte_k = 1, //!< 1-byte UTF8 character. - sz_utf8_rune_2bytes_k = 2, //!< 2-byte UTF8 character. - sz_utf8_rune_3bytes_k = 3, //!< 3-byte UTF8 character. - sz_utf8_rune_4bytes_k = 4, //!< 4-byte UTF8 character. -} sz_rune_length_t; - -typedef sz_u32_t sz_rune_t; - -/** - * @brief Extracts just one UTF8 codepoint from a UTF8 string into a 32-bit unsigned integer. - */ -SZ_INTERNAL void _sz_extract_utf8_rune(sz_cptr_t utf8, sz_rune_t *code, sz_rune_length_t *code_length) { - sz_u8_t const *current = (sz_u8_t const *)utf8; - sz_u8_t leading_byte = *current++; - sz_rune_t ch; - sz_rune_length_t ch_length; - - // TODO: This can be made entirely branchless using 32-bit SWAR. - if (leading_byte < 0x80) { - // Single-byte rune (0xxxxxxx) - ch = leading_byte; - ch_length = sz_utf8_rune_1byte_k; - } - else if ((leading_byte & 0xE0) == 0xC0) { - // Two-byte rune (110xxxxx 10xxxxxx) - ch = (leading_byte & 0x1F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_2bytes_k; - } - else if ((leading_byte & 0xF0) == 0xE0) { - // Three-byte rune (1110xxxx 10xxxxxx 10xxxxxx) - ch = (leading_byte & 0x0F) << 12; - ch |= (*current++ & 0x3F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_3bytes_k; - } - else if ((leading_byte & 0xF8) == 0xF0) { - // Four-byte rune (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) - ch = (leading_byte & 0x07) << 18; - ch |= (*current++ & 0x3F) << 12; - ch |= (*current++ & 0x3F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_4bytes_k; - } - else { - // Invalid UTF8 rune. - ch = 0; - ch_length = sz_utf8_invalid_k; - } - *code = ch; - *code_length = ch_length; -} - -/** - * @brief Exports a UTF8 string into a UTF32 buffer. - * ! The result is undefined id the UTF8 string is corrupted. - * @return The length in the number of codepoints. 
- */ -SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_length, sz_rune_t *utf32) { - sz_cptr_t const end = utf8 + utf8_length; - sz_size_t count = 0; - sz_rune_length_t rune_length; - for (; utf8 != end; utf8 += rune_length, utf32++, count++) _sz_extract_utf8_rune(utf8, utf32, &rune_length); - return count; -} - -/** - * @brief Compute the Levenshtein distance between two strings using the Wagner-Fisher algorithm. - * Stores only 2 rows of the Levenshtein matrix, but uses 64-bit integers for the distance values, - * and upcasts UTF8 variable-length codepoints to 64-bit integers for faster addressing. - * - * ! In the worst case for 2 strings of length 100, that contain just one 16-bit codepoint this will result in extra: - * + 2 rows * 100 slots * 8 bytes/slot = 1600 bytes of memory for the two rows of the Levenshtein matrix rows. - * + 100 codepoints * 2 strings * 4 bytes/codepoint = 800 bytes of memory for the UTF8 buffer. - * = 2400 bytes of memory or @b 12x memory amplification! - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_bool_t can_be_unicode, sz_memory_allocator_t *alloc) { - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // A good idea may be to dispatch different kernels for different string lengths. - // Like using `uint8_t` counters for strings under 255 characters long. - // Good in theory, this results in frequent upcasts and downcasts in serial code. - // On strings over 20 bytes, using `uint8` over `uint64` on 64-bit x86 CPU doubles the execution time. - // So one must be very cautious with such optimizations. - typedef sz_size_t _distance_t; - - // Compute the number of columns in our Levenshtein matrix. - sz_size_t const n = shorter_length + 1; - - // If a buffering memory-allocator is provided, this operation is practically free, - // and cheaper than allocating even 512 bytes (for small distance matrices) on stack. - sz_size_t buffer_length = sizeof(_distance_t) * (n * 2); - - // If the strings contain Unicode characters, let's estimate the max character width, - // and use it to allocate a larger buffer to decode UTF8. - if ((can_be_unicode == sz_true_k) && - (sz_isascii(longer, longer_length) == sz_false_k || sz_isascii(shorter, shorter_length) == sz_false_k)) { - buffer_length += (shorter_length + longer_length) * sizeof(sz_rune_t); - } - else { can_be_unicode = sz_false_k; } - - // If the allocation fails, return the maximum distance. - sz_ptr_t const buffer = (sz_ptr_t)alloc->allocate(buffer_length, alloc->handle); - if (!buffer) return SZ_SIZE_MAX; - - // Let's export the UTF8 sequence into the newly allocated buffer at the end. - if (can_be_unicode == sz_true_k) { - sz_rune_t *const longer_utf32 = (sz_rune_t *)(buffer + sizeof(_distance_t) * (n * 2)); - sz_rune_t *const shorter_utf32 = longer_utf32 + longer_length; - // Export the UTF8 sequences into the newly allocated buffer. 
- longer_length = _sz_export_utf8_to_utf32(longer, longer_length, longer_utf32); - shorter_length = _sz_export_utf8_to_utf32(shorter, shorter_length, shorter_utf32); - longer = (sz_cptr_t)longer_utf32; - shorter = (sz_cptr_t)shorter_utf32; - } - - // Let's parameterize the core logic for different character types and distance types. -#define _wagner_fisher_unbounded(_distance_t, _char_t) \ - /* Now let's cast our pointer to avoid it in subsequent sections. */ \ - _char_t const *const longer_chars = (_char_t const *)longer; \ - _char_t const *const shorter_chars = (_char_t const *)shorter; \ - _distance_t *previous_distances = (_distance_t *)buffer; \ - _distance_t *current_distances = previous_distances + n; \ - /* Initialize the first row of the Levenshtein matrix with `iota`-style arithmetic progression. */ \ - for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ - /* The main loop of the algorithm with quadratic complexity. */ \ - for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ - _char_t const longer_char = longer_chars[idx_longer]; \ - /* Using pure pointer arithmetic is faster than iterating with an index. */ \ - _char_t const *shorter_ptr = shorter_chars; \ - _distance_t const *previous_ptr = previous_distances; \ - _distance_t *current_ptr = current_distances; \ - _distance_t *const current_end = current_ptr + shorter_length; \ - current_ptr[0] = idx_longer + 1; \ - for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ - _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ - /* We can avoid `+1` for costs here, shifting it to post-minimum computation, */ \ - /* saving one increment operation. */ \ - _distance_t cost_deletion = previous_ptr[1]; \ - _distance_t cost_insertion = current_ptr[0]; \ - /* ? It might be a good idea to enforce branchless execution here. */ \ - /* ? The caveat being that the benchmarks on longer sequences backfire and more research is needed. */ \ - current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ - } \ - /* Swap `previous_distances` and `current_distances` pointers. */ \ - _distance_t *temporary = previous_distances; \ - previous_distances = current_distances; \ - current_distances = temporary; \ - } \ - /* Cache scalar before `free` call. */ \ - sz_size_t result = previous_distances[shorter_length]; \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return result; - - // Let's define a separate variant for bounded distance computation. - // Practically the same as unbounded, but also collecting the running minimum within each row for early exit. 
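/*
 * The macro above is a parameterized two-row Wagner-Fischer. The same recurrence written as a
 * plain function over bytes, for reference; it is a sketch, not the library's kernel, and the
 * name and allocation strategy are illustrative.
 */
#include <stddef.h>
#include <stdlib.h>

static size_t min3(size_t a, size_t b, size_t c) { return a < b ? (a < c ? a : c) : (b < c ? b : c); }

static size_t levenshtein_two_rows(char const *a, size_t a_len, char const *b, size_t b_len) {
    size_t *prev = (size_t *)malloc((b_len + 1) * 2 * sizeof(size_t));
    if (!prev) return (size_t)-1; /* allocation failure sentinel */
    size_t *curr = prev + b_len + 1;
    for (size_t j = 0; j <= b_len; ++j) prev[j] = j; /* distance from an empty prefix */
    for (size_t i = 0; i < a_len; ++i) {
        curr[0] = i + 1;
        for (size_t j = 0; j < b_len; ++j)
            curr[j + 1] = min3(prev[j] + (a[i] != b[j]), /* substitution or match */
                               prev[j + 1] + 1,          /* deletion */
                               curr[j] + 1);             /* insertion */
        size_t *tmp = prev; prev = curr; curr = tmp;
    }
    size_t result = prev[b_len];
    free(prev < curr ? prev : curr); /* the smaller pointer is the original allocation */
    return result;
}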
-#define _wagner_fisher_bounded(_distance_t, _char_t) \ - _char_t const *const longer_chars = (_char_t const *)longer; \ - _char_t const *const shorter_chars = (_char_t const *)shorter; \ - _distance_t *previous_distances = (_distance_t *)buffer; \ - _distance_t *current_distances = previous_distances + n; \ - for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ - for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ - _char_t const longer_char = longer_chars[idx_longer]; \ - _char_t const *shorter_ptr = shorter_chars; \ - _distance_t const *previous_ptr = previous_distances; \ - _distance_t *current_ptr = current_distances; \ - _distance_t *const current_end = current_ptr + shorter_length; \ - current_ptr[0] = idx_longer + 1; \ - /* Initialize min_distance with a value greater than bound */ \ - _distance_t min_distance = bound - 1; \ - for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ - _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ - _distance_t cost_deletion = previous_ptr[1]; \ - _distance_t cost_insertion = current_ptr[0]; \ - current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ - /* Keep track of the minimum distance seen so far in this row */ \ - min_distance = sz_min_of_two(current_ptr[1], min_distance); \ - } \ - /* If the minimum distance in this row exceeded the bound, return early */ \ - if (min_distance >= bound) { \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return bound; \ - } \ - _distance_t *temporary = previous_distances; \ - previous_distances = current_distances; \ - current_distances = temporary; \ - } \ - sz_size_t result = previous_distances[shorter_length]; \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return sz_min_of_two(result, bound); - - // Dispatch the actual computation. - if (!bound) { - if (can_be_unicode == sz_true_k) { _wagner_fisher_unbounded(sz_size_t, sz_rune_t); } - else { _wagner_fisher_unbounded(sz_size_t, sz_u8_t); } - } - else { - if (can_be_unicode == sz_true_k) { _wagner_fisher_bounded(sz_size_t, sz_rune_t); } - else { _wagner_fisher_bounded(sz_size_t, sz_u8_t); } - } -} - -SZ_PUBLIC sz_size_t sz_edit_distance_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Skip the matching prefixes and suffixes, they won't affect the distance. - for (sz_cptr_t a_end = longer + longer_length, b_end = shorter + shorter_length; - longer != a_end && shorter != b_end && *longer == *shorter; - ++longer, ++shorter, --longer_length, --shorter_length); - for (; longer_length && shorter_length && longer[longer_length - 1] == shorter[shorter_length - 1]; - --longer_length, --shorter_length); - - // Bounded computations may exit early. - int const is_bounded = bound < longer_length; - if (is_bounded) { - // If one of the strings is empty - the edit distance is equal to the length of the other one. 
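/*
 * The function above first strips the shared prefix and suffix, since identical ends never add
 * edits, shrinking the quadratic part of the work. A standalone sketch of that trimming step;
 * names are illustrative. For example, "unhappy" vs "unhappily" reduces to "" vs "il".
 */
#include <stddef.h>

static void trim_common_affixes(char const **a, size_t *a_len, char const **b, size_t *b_len) {
    while (*a_len && *b_len && (*a)[0] == (*b)[0]) { ++*a, ++*b, --*a_len, --*b_len; }
    while (*a_len && *b_len && (*a)[*a_len - 1] == (*b)[*b_len - 1]) { --*a_len, --*b_len; }
}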
- if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); - // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; - } - - if (shorter_length == 0) return longer_length; // If no mismatches were found - the distance is zero. - if (shorter_length == longer_length && !is_bounded) - return _sz_edit_distance_skewed_diagonals_serial(longer, longer_length, shorter, shorter_length, bound, alloc); - return _sz_edit_distance_wagner_fisher_serial(longer, longer_length, shorter, shorter_length, bound, sz_false_k, - alloc); -} - -SZ_PUBLIC sz_ssize_t sz_alignment_score_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc) { - - // If one of the strings is empty - the edit distance is equal to the length of the other one - if (longer_length == 0) return (sz_ssize_t)shorter_length * gap; - if (shorter_length == 0) return (sz_ssize_t)longer_length * gap; - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - sz_size_t n = shorter_length + 1; - sz_size_t buffer_length = sizeof(sz_ssize_t) * n * 2; - sz_ssize_t *distances = (sz_ssize_t *)alloc->allocate(buffer_length, alloc->handle); - sz_ssize_t *previous_distances = distances; - sz_ssize_t *current_distances = previous_distances + n; - - for (sz_size_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) - previous_distances[idx_shorter] = (sz_ssize_t)idx_shorter * gap; - - sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter; - sz_u8_t const *longer_unsigned = (sz_u8_t const *)longer; - for (sz_size_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { - current_distances[0] = ((sz_ssize_t)idx_longer + 1) * gap; - - // Initialize min_distance with a value greater than bound - sz_error_cost_t const *a_subs = subs + longer_unsigned[idx_longer] * 256ul; - for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) { - sz_ssize_t cost_deletion = previous_distances[idx_shorter + 1] + gap; - sz_ssize_t cost_insertion = current_distances[idx_shorter] + gap; - sz_ssize_t cost_substitution = previous_distances[idx_shorter] + a_subs[shorter_unsigned[idx_shorter]]; - current_distances[idx_shorter + 1] = sz_max_of_three(cost_deletion, cost_insertion, cost_substitution); - } - - // Swap previous_distances and current_distances pointers - sz_pointer_swap((void **)&previous_distances, (void **)¤t_distances); - } - - // Cache scalar before `free` call. 
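/*
 * The alignment-scoring routine here is a two-row Needleman-Wunsch: each cell takes the maximum
 * of substitution, deletion, and insertion, with a 256x256 substitution matrix and a flat gap cost.
 * A compact standalone sketch follows; the signature and the error sentinel are illustrative.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

static ptrdiff_t max3(ptrdiff_t a, ptrdiff_t b, ptrdiff_t c) {
    ptrdiff_t m = a > b ? a : b;
    return m > c ? m : c;
}

static ptrdiff_t align_score(unsigned char const *a, size_t a_len, unsigned char const *b, size_t b_len,
                             signed char const subs[256][256], ptrdiff_t gap) {
    ptrdiff_t *prev = (ptrdiff_t *)malloc((b_len + 1) * 2 * sizeof(ptrdiff_t));
    if (!prev) return PTRDIFF_MIN; /* allocation failure sentinel */
    ptrdiff_t *curr = prev + b_len + 1;
    for (size_t j = 0; j <= b_len; ++j) prev[j] = (ptrdiff_t)j * gap; /* leading gaps */
    for (size_t i = 0; i < a_len; ++i) {
        curr[0] = (ptrdiff_t)(i + 1) * gap;
        for (size_t j = 0; j < b_len; ++j)
            curr[j + 1] = max3(prev[j] + subs[a[i]][b[j]], /* substitution or match */
                               prev[j + 1] + gap,          /* deletion */
                               curr[j] + gap);             /* insertion */
        ptrdiff_t *tmp = prev; prev = curr; curr = tmp;
    }
    ptrdiff_t score = prev[b_len];
    free(prev < curr ? prev : curr);
    return score;
}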
- sz_ssize_t result = previous_distances[shorter_length]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - - sz_size_t const min_length = sz_min_of_two(a_length, b_length); - sz_size_t const max_length = sz_max_of_two(a_length, b_length); - sz_cptr_t const a_end = a + min_length; - bound = bound == 0 ? max_length : bound; - - // Walk through both strings using SWAR and counting the number of differing characters. - sz_size_t distance = max_length - min_length; -#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN - if (min_length >= SZ_SWAR_THRESHOLD) { - sz_u64_vec_t a_vec, b_vec, match_vec; - for (; a + 8 <= a_end && distance < bound; a += 8, b += 8) { - a_vec.u64 = sz_u64_load(a).u64; - b_vec.u64 = sz_u64_load(b).u64; - match_vec = _sz_u64_each_byte_equal(a_vec, b_vec); - distance += sz_u64_popcount((~match_vec.u64) & 0x8080808080808080ull); - } - } -#endif - - for (; a != a_end && distance < bound; ++a, ++b) { distance += (*a != *b); } - return sz_min_of_two(distance, bound); -} - -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - - sz_cptr_t const a_end = a + a_length; - sz_cptr_t const b_end = b + b_length; - sz_size_t distance = 0; - - sz_rune_t a_rune, b_rune; - sz_rune_length_t a_rune_length, b_rune_length; - - if (bound) { - for (; a < a_end && b < b_end && distance < bound; a += a_rune_length, b += b_rune_length) { - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - distance += (a_rune != b_rune); - } - // If one string has more runes, we need to go through the tail. - if (distance < bound) { - for (; a < a_end && distance < bound; a += a_rune_length, ++distance) - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - - for (; b < b_end && distance < bound; b += b_rune_length, ++distance) - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - } - } - else { - for (; a < a_end && b < b_end; a += a_rune_length, b += b_rune_length) { - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - distance += (a_rune != b_rune); - } - // If one string has more runes, we need to go through the tail. - for (; a < a_end; a += a_rune_length, ++distance) _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - for (; b < b_end; b += b_rune_length, ++distance) _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - } - return distance; -} - -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length) { - sz_u64_t checksum = 0; - sz_u8_t const *text_u8 = (sz_u8_t const *)text; - sz_u8_t const *text_end = text_u8 + length; - for (; text_u8 != text_end; ++text_u8) checksum += *text_u8; - return checksum; -} - -/** - * @brief Largest prime number that fits into 31 bits. - * @see https://mersenneforum.org/showthread.php?t=3471 - */ -#define SZ_U32_MAX_PRIME (2147483647u) - -/** - * @brief Largest prime number that fits into 64 bits. - * @see https://mersenneforum.org/showthread.php?t=3471 - * - * 2^64 = 18,446,744,073,709,551,616 - * this = 18,446,744,073,709,551,557 - * diff = 59 - */ -#define SZ_U64_MAX_PRIME (18446744073709551557ull) - -/* - * One hardware-accelerated way of mixing hashes can be CRC, but it's only implemented for 32-bit values. 
- * Using a Boost-like mixer works very poorly in such case: - * - * hash_first ^ (hash_second + 0x517cc1b727220a95 + (hash_first << 6) + (hash_first >> 2)); - * - * Let's stick to the Fibonacci hash trick using the golden ratio. - * https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ - */ -#define _sz_hash_mix(first, second) ((first * 11400714819323198485ull) ^ (second * 11400714819323198485ull)) -#define _sz_shift_low(x) (x) -#define _sz_shift_high(x) ((x + 77ull) & 0xFFull) -#define _sz_prime_mod(x) (x % SZ_U64_MAX_PRIME) - -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length) { - - sz_u64_t hash_low = 0; - sz_u64_t hash_high = 0; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - switch (length) { - case 0: return 0; - - // Texts under 7 bytes long are definitely below the largest prime. - case 1: - hash_low = _sz_shift_low(text[0]); - hash_high = _sz_shift_high(text[0]); - break; - case 2: - hash_low = _sz_shift_low(text[0]) * 31ull + _sz_shift_low(text[1]); - hash_high = _sz_shift_high(text[0]) * 257ull + _sz_shift_high(text[1]); - break; - case 3: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull + // - _sz_shift_low(text[2]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull + // - _sz_shift_high(text[2]); - break; - case 4: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull + // - _sz_shift_low(text[3]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull + // - _sz_shift_high(text[3]); - break; - case 5: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull + // - _sz_shift_low(text[4]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull + // - _sz_shift_high(text[4]); - break; - case 6: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull + // - _sz_shift_low(text[5]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull + // - _sz_shift_high(text[5]); - break; - case 7: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull * 31ull + // - _sz_shift_low(text[5]) * 31ull + // - _sz_shift_low(text[6]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull 
* 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull * 257ull + // - _sz_shift_high(text[5]) * 257ull + // - _sz_shift_high(text[6]); - break; - default: - // Unroll the first seven cycles: - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - hash_low = hash_low * 31ull + _sz_shift_low(text[1]); - hash_high = hash_high * 257ull + _sz_shift_high(text[1]); - hash_low = hash_low * 31ull + _sz_shift_low(text[2]); - hash_high = hash_high * 257ull + _sz_shift_high(text[2]); - hash_low = hash_low * 31ull + _sz_shift_low(text[3]); - hash_high = hash_high * 257ull + _sz_shift_high(text[3]); - hash_low = hash_low * 31ull + _sz_shift_low(text[4]); - hash_high = hash_high * 257ull + _sz_shift_high(text[4]); - hash_low = hash_low * 31ull + _sz_shift_low(text[5]); - hash_high = hash_high * 257ull + _sz_shift_high(text[5]); - hash_low = hash_low * 31ull + _sz_shift_low(text[6]); - hash_high = hash_high * 257ull + _sz_shift_high(text[6]); - text += 7; - - // Iterate throw the rest with the modulus: - for (; text != text_end; ++text) { - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - } - break; - } - - return _sz_hash_mix(hash_low, hash_high); -} - -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Compute the initial hash value for the first window. - sz_u64_t hash_low = 0, hash_high = 0, hash_mix; - for (sz_u8_t const *first_end = text + window_length; text < first_end; ++text) - hash_low = (hash_low * 31ull + _sz_shift_low(*text)) % SZ_U64_MAX_PRIME, - hash_high = (hash_high * 257ull + _sz_shift_high(*text)) % SZ_U64_MAX_PRIME; - - // In most cases the fingerprint length will be a power of two. - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - - // Compute the hash value for every window, exporting into the fingerprint, - // using the expensive modulo operation. - sz_size_t cycles = 1; - sz_size_t const step_mask = step - 1; - for (; text < text_end; ++text, ++cycles) { - // Discard one character: - hash_low -= _sz_shift_low(*(text - window_length)) * prime_power_low; - hash_high -= _sz_shift_high(*(text - window_length)) * prime_power_high; - // And add a new one: - hash_low = 31ull * hash_low + _sz_shift_low(*text); - hash_high = 257ull * hash_high + _sz_shift_high(*text); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - // Mix only if we've skipped enough hashes. 
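/*
 * The rolling variant above removes the outgoing byte (scaled by a precomputed power of the base),
 * folds in the incoming byte, and reduces by a prime. A minimal Rabin-Karp-style sketch with one
 * base and plain 2^64 wrap-around instead of the prime modulus, to keep the bookkeeping visible;
 * names and the callback signature are illustrative.
 */
#include <stddef.h>
#include <stdint.h>

static void rolling_hashes(unsigned char const *text, size_t length, size_t window,
                           void (*callback)(size_t offset, uint64_t hash, void *handle), void *handle) {
    if (!window || length < window) return;
    uint64_t const base = 257;
    uint64_t top_power = 1; /* base^(window - 1), used to drop the outgoing byte */
    for (size_t i = 0; i + 1 < window; ++i) top_power *= base;
    uint64_t hash = 0;
    for (size_t i = 0; i < window; ++i) hash = hash * base + text[i];
    callback(0, hash, handle);
    for (size_t i = window; i < length; ++i) {
        hash = (hash - text[i - window] * top_power) * base + text[i]; /* slide by one byte */
        callback(i - window + 1, hash, handle);
    }
}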
- if ((cycles & step_mask) == 0) { - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - } - } -} - -#undef _sz_shift_low -#undef _sz_shift_high -#undef _sz_hash_mix -#undef _sz_prime_mod - -/** - * @brief Uses a small lookup-table to convert a lowercase character to uppercase. - */ -SZ_INTERNAL sz_u8_t sz_u8_tolower(sz_u8_t c) { - static sz_u8_t const lowered[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return lowered[c]; -} - -/** - * @brief Uses a small lookup-table to convert an uppercase character to lowercase. - */ -SZ_INTERNAL sz_u8_t sz_u8_toupper(sz_u8_t c) { - static sz_u8_t const upped[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return upped[c]; -} - -/** - * @brief Uses two small lookup tables (768 bytes total) to accelerate division by a small - * unsigned integer. Performs two lookups, one multiplication, two shifts, and two accumulations. - * - * @param divisor Integral value @b larger than one. - * @param number Integral value to divide. 
- */ -SZ_INTERNAL sz_u8_t sz_u8_divide(sz_u8_t number, sz_u8_t divisor) { - sz_assert(divisor > 1); - static sz_u16_t const multipliers[256] = { - 0, 0, 0, 21846, 0, 39322, 21846, 9363, 0, 50973, 39322, 29790, 21846, 15124, 9363, 4370, - 0, 57826, 50973, 44841, 39322, 34329, 29790, 25645, 21846, 18351, 15124, 12137, 9363, 6780, 4370, 2115, - 0, 61565, 57826, 54302, 50973, 47824, 44841, 42011, 39322, 36765, 34329, 32006, 29790, 27671, 25645, 23705, - 21846, 20063, 18351, 16706, 15124, 13602, 12137, 10725, 9363, 8049, 6780, 5554, 4370, 3224, 2115, 1041, - 0, 63520, 61565, 59668, 57826, 56039, 54302, 52614, 50973, 49377, 47824, 46313, 44841, 43407, 42011, 40649, - 39322, 38028, 36765, 35532, 34329, 33154, 32006, 30885, 29790, 28719, 27671, 26647, 25645, 24665, 23705, 22766, - 21846, 20945, 20063, 19198, 18351, 17520, 16706, 15907, 15124, 14356, 13602, 12863, 12137, 11424, 10725, 10038, - 9363, 8700, 8049, 7409, 6780, 6162, 5554, 4957, 4370, 3792, 3224, 2665, 2115, 1573, 1041, 517, - 0, 64520, 63520, 62535, 61565, 60609, 59668, 58740, 57826, 56926, 56039, 55164, 54302, 53452, 52614, 51788, - 50973, 50169, 49377, 48595, 47824, 47063, 46313, 45572, 44841, 44120, 43407, 42705, 42011, 41326, 40649, 39982, - 39322, 38671, 38028, 37392, 36765, 36145, 35532, 34927, 34329, 33738, 33154, 32577, 32006, 31443, 30885, 30334, - 29790, 29251, 28719, 28192, 27671, 27156, 26647, 26143, 25645, 25152, 24665, 24182, 23705, 23233, 22766, 22303, - 21846, 21393, 20945, 20502, 20063, 19628, 19198, 18772, 18351, 17933, 17520, 17111, 16706, 16305, 15907, 15514, - 15124, 14738, 14356, 13977, 13602, 13231, 12863, 12498, 12137, 11779, 11424, 11073, 10725, 10380, 10038, 9699, - 9363, 9030, 8700, 8373, 8049, 7727, 7409, 7093, 6780, 6470, 6162, 5857, 5554, 5254, 4957, 4662, - 4370, 4080, 3792, 3507, 3224, 2943, 2665, 2388, 2115, 1843, 1573, 1306, 1041, 778, 517, 258, - }; - // This table can be avoided using a single addition and counting trailing zeros. 
- static sz_u8_t const shifts[256] = { - 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // - 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // - 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // - 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - }; - sz_u32_t multiplier = multipliers[divisor]; - sz_u8_t shift = shifts[divisor]; - - sz_u16_t q = (sz_u16_t)((multiplier * number) >> 16); - sz_u16_t t = ((number - q) >> 1) + q; - return (sz_u8_t)(t >> shift); -} - -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result) { - sz_u8_t const *unsigned_lut = (sz_u8_t const *)lut; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = unsigned_lut[*unsigned_text]; -} - -SZ_PUBLIC void sz_tolower_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_tolower(*unsigned_text); -} - -SZ_PUBLIC void sz_toupper_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_toupper(*unsigned_text); -} - -SZ_PUBLIC void sz_toascii_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = *unsigned_text & 0x7F; -} - -/** - * @brief Check if there is a byte in this buffer, that exceeds 127 and can't be an ASCII character. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - */ -SZ_PUBLIC sz_bool_t sz_isascii_serial(sz_cptr_t text, sz_size_t length) { - - if (!length) return sz_true_k; - sz_u8_t const *h = (sz_u8_t const *)text; - sz_u8_t const *const h_end = h + length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h & 0x80ull) return sz_false_k; -#endif - - // Validate eight bytes at once using SWAR. - sz_u64_vec_t text_vec; - for (; h + 8 <= h_end; h += 8) { - text_vec.u64 = *(sz_u64_t const *)h; - if (text_vec.u64 & 0x8080808080808080ull) return sz_false_k; - } - - // Handle the misaligned tail. 
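/*
 * The ASCII check above tests eight bytes per iteration against the 0x80 mask. The same idea as a
 * standalone sketch with a `memcpy` load, which sidesteps alignment concerns; illustrative names.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static bool is_ascii(char const *text, size_t length) {
    size_t i = 0;
    for (; i + 8 <= length; i += 8) {
        uint64_t word;
        memcpy(&word, text + i, 8); /* misalignment-safe 64-bit load */
        if (word & 0x8080808080808080ull) return false;
    }
    for (; i < length; ++i)
        if ((unsigned char)text[i] & 0x80u) return false;
    return true;
}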
- for (; h < h_end; ++h) - if (*h & 0x80ull) return sz_false_k; - return sz_true_k; -} - -SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { - - sz_assert(alphabet_size > 0 && alphabet_size <= 256 && "Inadequate alphabet size"); - - if (alphabet_size == 1) sz_fill(result, result_length, *alphabet); - - else { - sz_assert(generator && "Expects a valid random generator"); - sz_u8_t divisor = (sz_u8_t)alphabet_size; - for (sz_cptr_t end = result + result_length; result != end; ++result) { - sz_u8_t random = generator(generator_user_data) & 0xFF; - sz_u8_t quotient = sz_u8_divide(random, divisor); - *result = alphabet[random - quotient * divisor]; - } - } -} - -#pragma endregion - -/* - * Serial implementation of string class operations. - */ -#pragma region Serial Implementation for the String Class - -SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string) { - // It doesn't matter if it's on stack or heap, the pointer location is the same. - return (sz_bool_t)((sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]); -} - -SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length) { - sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]; - sz_size_t is_big_mask = is_small - 1ull; - *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same. - // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. - *length = string->external.length & (0x00000000000000FFull | is_big_mask); -} - -SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, - sz_bool_t *is_external) { - sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]; - sz_size_t is_big_mask = is_small - 1ull; - *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same. - // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. - *length = string->external.length & (0x00000000000000FFull | is_big_mask); - // In case the string is small, the `is_small - 1ull` will become 0xFFFFFFFFFFFFFFFFull. - *space = sz_u64_blend(SZ_STRING_INTERNAL_SPACE, string->external.space, is_big_mask); - *is_external = (sz_bool_t)!is_small; -} - -SZ_PUBLIC sz_bool_t sz_string_equal(sz_string_t const *a, sz_string_t const *b) { - // Tempting to say that the external.length is bitwise the same even if it includes - // some bytes of the on-stack payload, but we don't at this writing maintain that invariant. - // (An on-stack string includes noise bytes in the high-order bits of external.length. So do this - // the hard/correct way. - -#if SZ_USE_MISALIGNED_LOADS - // Dealing with StringZilla strings, we know that the `start` pointer always points - // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. - -#endif - // Alternatively, fall back to byte-by-byte comparison. 
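/*
 * The accessors above read the length branchlessly: `is_small - 1` becomes an all-ones mask for a
 * heap-allocated string and zero for an on-stack one, which then keeps either the whole length word
 * or just its low byte. A sketch of that mask-select idiom; names are illustrative.
 */
#include <stdint.h>

/* Returns `if_one` when `condition` is 1 and `if_zero` when it is 0, without branching. */
static uint64_t select_u64(uint64_t condition, uint64_t if_one, uint64_t if_zero) {
    uint64_t mask = condition - 1; /* 1 -> all zeros, 0 -> all ones */
    return (if_one & ~mask) | (if_zero & mask);
}

/* The same trick on a small-string-optimized length: stack strings keep the length in one byte. */
static uint64_t unpack_length(uint64_t raw_length_word, uint64_t is_small) {
    uint64_t is_big_mask = is_small - 1; /* all ones when the string lives on the heap */
    return raw_length_word & (0xFFull | is_big_mask);
}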
- sz_ptr_t a_start, b_start; - sz_size_t a_length, b_length; - sz_string_range(a, &a_start, &a_length); - sz_string_range(b, &b_start, &b_length); - return (sz_bool_t)(a_length == b_length && sz_equal(a_start, b_start, b_length)); -} - -SZ_PUBLIC sz_ordering_t sz_string_order(sz_string_t const *a, sz_string_t const *b) { -#if SZ_USE_MISALIGNED_LOADS - // Dealing with StringZilla strings, we know that the `start` pointer always points - // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. - -#endif - // Alternatively, fall back to byte-by-byte comparison. - sz_ptr_t a_start, b_start; - sz_size_t a_length, b_length; - sz_string_range(a, &a_start, &a_length); - sz_string_range(b, &b_start, &b_length); - return sz_order(a_start, a_length, b_start, b_length); -} - -SZ_PUBLIC void sz_string_init(sz_string_t *string) { - sz_assert(string && "String can't be SZ_NULL."); - - // Only 8 + 1 + 1 need to be initialized. - string->internal.start = &string->internal.chars[0]; - // But for safety let's initialize the entire structure to zeros. - // string->internal.chars[0] = 0; - // string->internal.length = 0; - string->words[1] = 0; - string->words[2] = 0; - string->words[3] = 0; -} - -SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator) { - sz_size_t space_needed = length + 1; // space for trailing \0 - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); - // Initialize the string to zeros for safety. - string->words[1] = 0; - string->words[2] = 0; - string->words[3] = 0; - // If we are lucky, no memory allocations will be needed. - if (space_needed <= SZ_STRING_INTERNAL_SPACE) { - string->internal.start = &string->internal.chars[0]; - string->internal.length = (sz_u8_t)length; - } - else { - // If we are not lucky, we need to allocate memory. - string->external.start = (sz_ptr_t)allocator->allocate(space_needed, allocator->handle); - if (!string->external.start) return SZ_NULL_CHAR; - string->external.length = length; - string->external.space = space_needed; - } - sz_assert(&string->internal.start == &string->external.start && "Alignment confusion"); - string->external.start[length] = 0; - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - - sz_size_t new_space = new_capacity + 1; - if (new_space <= SZ_STRING_INTERNAL_SPACE) return string->external.start; - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - sz_assert(new_space > string_space && "New space must be larger than current."); - - sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); - if (!new_start) return SZ_NULL_CHAR; - - sz_copy(new_start, string_start, string_length); - string->external.start = new_start; - string->external.space = new_space; - string->external.padding = 0; - string->external.length = string_length; - - // Deallocate the old string. 
- if (string_is_external) allocator->free(string_start, string_space, allocator->handle); - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // We may already be space-optimal, and in that case we don't need to do anything. - sz_size_t new_space = string_length + 1; - if (string_space == new_space || !string_is_external) return string->external.start; - - sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); - if (!new_start) return SZ_NULL_CHAR; - - sz_copy(new_start, string_start, string_length); - string->external.start = new_start; - string->external.space = new_space; - string->external.padding = 0; - string->external.length = string_length; - - // Deallocate the old string. - if (string_is_external) allocator->free(string_start, string_space, allocator->handle); - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length, - sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // The user intended to extend the string. - offset = sz_min_of_two(offset, string_length); - - // If we are lucky, no memory allocations will be needed. - if (string_length + added_length < string_space) { - sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); - string_start[string_length + added_length] = 0; - // Even if the string is on the stack, the `+=` won't affect the tail of the string. - string->external.length += added_length; - } - // If we are not lucky, we need to allocate more memory. - else { - sz_size_t next_planned_size = sz_max_of_two(SZ_CACHE_LINE_WIDTH, string_space * 2ull); - sz_size_t min_needed_space = sz_size_bit_ceil(offset + string_length + added_length + 1); - sz_size_t new_space = sz_max_of_two(min_needed_space, next_planned_size); - string_start = sz_string_reserve(string, new_space - 1, allocator); - if (!string_start) return SZ_NULL_CHAR; - - // Copy into the new buffer. - sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); - string_start[string_length + added_length] = 0; - string->external.length = string_length + added_length; - } - - return string_start; -} - -SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length) { - - sz_assert(string && "String can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // Normalize the offset, it can't be larger than the length. - offset = sz_min_of_two(offset, string_length); - - // We shouldn't normalize the length, to avoid overflowing on `offset + length >= string_length`, - // if receiving `length == SZ_SIZE_MAX`. 
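/*
 * The growth policy above takes the larger of "double the current space" and one cache line, then
 * makes sure the result also covers the required size rounded up to a power of two. A standalone
 * sketch of that policy; the cache-line constant and names are illustrative.
 */
#include <stddef.h>

static size_t bit_ceil_size(size_t x) { /* smallest power of two >= x, for non-zero x */
    size_t p = 1;
    while (p < x) p <<= 1;
    return p;
}

static size_t next_capacity(size_t current_space, size_t needed_space) {
    size_t const cache_line = 64;
    size_t planned = current_space * 2 > cache_line ? current_space * 2 : cache_line;
    size_t minimal = bit_ceil_size(needed_space);
    return planned > minimal ? planned : minimal;
}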
After the following expression, `length` will contain
-    // exactly the delta between the original and final length of this `string`.
-    length = sz_min_of_two(length, string_length - offset);
-
-    // There are 2 common cases that wouldn't even require a `memmove`:
-    //      1. Erasing the entire contents of the string.
-    //         In that case `length` argument will be equal or greater than `length` member.
-    //      2. Removing the tail of the string with something like `string.pop_back()` in C++.
-    //
-    // In both of those, regardless of the location of the string - stack or heap,
-    // the erasing is as easy as setting the length to the offset.
-    // In every other case, we must `memmove` the tail of the string to the left.
-    if (offset + length < string_length)
-        sz_move(string_start + offset, string_start + offset + length, string_length - offset - length);
-
-    // The `string->external.length = offset` assignment would discard last characters
-    // of the on-the-stack string, but in-place subtraction would work.
-    string->external.length -= length;
-    string_start[string_length - length] = 0;
-    return length;
-}
-
-SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator) {
-    if (!sz_string_is_on_stack(string))
-        allocator->free(string->external.start, string->external.space, allocator->handle);
-    sz_string_init(string);
-}
-
-// When overriding libc, disable optimisations for this function because MSVC will optimize the loops into a memset,
-// which then causes a stack overflow due to infinite recursion (memset -> sz_fill_serial -> memset).
-#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC
-#pragma optimize("", off)
-#endif
-SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value) {
-    sz_ptr_t end = target + length;
-    // Dealing with short strings, a single sequential pass would be faster.
-    // If the size is larger than 2 words, then at least 1 of them will be aligned.
-    // But just one aligned word may not be worth SWAR.
-    if (length < SZ_SWAR_THRESHOLD)
-        while (target != end) *(target++) = value;
-
-    // In case of long strings, skip unaligned bytes, and then fill the rest in 64-bit chunks.
-    else {
-        sz_u64_t value64 = (sz_u64_t)value * 0x0101010101010101ull;
-        while ((sz_size_t)target & 7ull) *(target++) = value;
-        while (target + 8 <= end) *(sz_u64_t *)target = value64, target += 8;
-        while (target != end) *(target++) = value;
-    }
-}
-#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC
-#pragma optimize("", on)
-#endif
-
-SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
-#if SZ_USE_MISALIGNED_LOADS
-    while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)source, target += 8, source += 8, length -= 8;
-#endif
-    while (length--) *(target++) = *(source++);
-}
-
-SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
-    // Implementing `memmove` is trickier than `memcpy`, as the ranges may overlap.
-    // Existing implementations often have two passes, in normal and reversed order,
-    // depending on the relation of `target` and `source` addresses.
-    // https://student.cs.uwaterloo.ca/~cs350/common/os161-src-html/doxygen/html/memmove_8c_source.html
-    // https://marmota.medium.com/c-language-making-memmove-def8792bb8d5
-    //
-    // We can use the `memcpy`-like left-to-right pass if we know that the `target` is before `source`.
-    // Or if we know that they don't intersect!
In that case the traversal order is irrelevant, - // but older CPUs may predict and fetch forward-passes better. - if (target < source || target >= source + length) { -#if SZ_USE_MISALIGNED_LOADS - while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)(source), target += 8, source += 8, length -= 8; -#endif - while (length--) *(target++) = *(source++); - } - else { - // Jump to the end and walk backwards. - target += length, source += length; -#if SZ_USE_MISALIGNED_LOADS - while (length >= 8) *(sz_u64_t *)(target -= 8) = *(sz_u64_t const *)(source -= 8), length -= 8; -#endif - while (length--) *(--target) = *(--source); - } -} - -#pragma endregion - -/* - * @brief Serial implementation for strings sequence processing. - */ -#pragma region Serial Implementation for Sequences - -SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) { - - sz_size_t matches = 0; - while (matches != sequence->count && predicate(sequence, sequence->order[matches])) ++matches; - - for (sz_size_t i = matches + 1; i < sequence->count; ++i) - if (predicate(sequence, sequence->order[i])) - sz_u64_swap(sequence->order + i, sequence->order + matches), ++matches; - - return matches; -} - -SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) { - - sz_size_t start_b = partition + 1; - - // If the direct merge is already sorted - if (!less(sequence, sequence->order[start_b], sequence->order[partition])) return; - - sz_size_t start_a = 0; - while (start_a <= partition && start_b <= sequence->count) { - - // If element 1 is in right place - if (!less(sequence, sequence->order[start_b], sequence->order[start_a])) { start_a++; } - else { - sz_size_t value = sequence->order[start_b]; - sz_size_t index = start_b; - - // Shift all the elements between element 1 - // element 2, right by 1. 
- while (index != start_a) { sequence->order[index] = sequence->order[index - 1], index--; } - sequence->order[start_a] = value; - - // Update all the pointers - start_a++; - partition++; - start_b++; - } - } -} - -SZ_PUBLIC void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - sz_u64_t *keys = sequence->order; - sz_size_t keys_count = sequence->count; - for (sz_size_t i = 1; i < keys_count; i++) { - sz_u64_t i_key = keys[i]; - sz_size_t j = i; - for (; j > 0 && less(sequence, i_key, keys[j - 1]); --j) keys[j] = keys[j - 1]; - keys[j] = i_key; - } -} - -SZ_INTERNAL void _sz_sift_down(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start, - sz_size_t end) { - sz_size_t root = start; - while (2 * root + 1 <= end) { - sz_size_t child = 2 * root + 1; - if (child + 1 <= end && less(sequence, order[child], order[child + 1])) { child++; } - if (!less(sequence, order[root], order[child])) { return; } - sz_u64_swap(order + root, order + child); - root = child; - } -} - -SZ_INTERNAL void _sz_heapify(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t count) { - sz_size_t start = (count - 2) / 2; - while (1) { - _sz_sift_down(sequence, less, order, start, count - 1); - if (start == 0) return; - start--; - } -} - -SZ_INTERNAL void _sz_heapsort(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last) { - sz_u64_t *order = sequence->order; - sz_size_t count = last - first; - _sz_heapify(sequence, less, order + first, count); - sz_size_t end = count - 1; - while (end > 0) { - sz_u64_swap(order + first, order + first + end); - end--; - _sz_sift_down(sequence, less, order + first, 0, end); - } -} - -SZ_PUBLIC void sz_sort_introsort_recursion(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, - sz_size_t last, sz_size_t depth) { - - sz_size_t length = last - first; - switch (length) { - case 0: - case 1: return; - case 2: - if (less(sequence, sequence->order[first + 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[first + 1]); - return; - case 3: { - sz_u64_t a = sequence->order[first]; - sz_u64_t b = sequence->order[first + 1]; - sz_u64_t c = sequence->order[first + 2]; - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - if (less(sequence, c, b)) sz_u64_swap(&c, &b); - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - sequence->order[first] = a; - sequence->order[first + 1] = b; - sequence->order[first + 2] = c; - return; - } - } - // Until a certain length, the quadratic-complexity insertion-sort is fine - if (length <= 16) { - sz_sequence_t sub_seq = *sequence; - sub_seq.order += first; - sub_seq.count = length; - sz_sort_insertion(&sub_seq, less); - return; - } - - // Fallback to N-logN-complexity heap-sort - if (depth == 0) { - _sz_heapsort(sequence, less, first, last); - return; - } - - --depth; - - // Median-of-three logic to choose pivot - sz_size_t median = first + length / 2; - if (less(sequence, sequence->order[median], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[median]); - if (less(sequence, sequence->order[last - 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[last - 1]); - if (less(sequence, sequence->order[median], sequence->order[last - 1])) - sz_u64_swap(&sequence->order[median], &sequence->order[last - 1]); - - // Partition using the median-of-three as the pivot - sz_u64_t pivot = sequence->order[median]; - sz_size_t left = first; - 
sz_size_t right = last - 1; - while (1) { - while (less(sequence, sequence->order[left], pivot)) left++; - while (less(sequence, pivot, sequence->order[right])) right--; - if (left >= right) break; - sz_u64_swap(&sequence->order[left], &sequence->order[right]); - left++; - right--; - } - - // Recursively sort the partitions - sz_sort_introsort_recursion(sequence, less, first, left, depth); - sz_sort_introsort_recursion(sequence, less, right + 1, last, depth); -} - -SZ_PUBLIC void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - if (sequence->count == 0) return; - sz_size_t size_is_not_power_of_two = (sequence->count & (sequence->count - 1)) != 0; - sz_size_t depth_limit = sz_size_log2i_nonzero(sequence->count) + size_is_not_power_of_two; - sz_sort_introsort_recursion(sequence, less, 0, sequence->count, depth_limit); -} - -SZ_PUBLIC void sz_sort_recursion( // - sz_sequence_t *sequence, sz_size_t bit_idx, sz_size_t bit_max, sz_sequence_comparator_t comparator, - sz_size_t partial_order_length) { - - if (!sequence->count) return; - - // Array of size one doesn't need sorting - only needs the prefix to be discarded. - if (sequence->count == 1) { - sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; - order_half_words[1] = 0; - return; - } - - // Partition a range of integers according to a specific bit value - sz_size_t split = 0; - sz_u64_t mask = (1ull << 63) >> bit_idx; - - // The clean approach would be to perform a single pass over the sequence. - // - // while (split != sequence->count && !(sequence->order[split] & mask)) ++split; - // for (sz_size_t i = split + 1; i < sequence->count; ++i) - // if (!(sequence->order[i] & mask)) sz_u64_swap(sequence->order + i, sequence->order + split), ++split; - // - // This, however, doesn't take into account the high relative cost of writes and swaps. - // To circumvent that, we can first count the total number entries to be mapped into either part. - // And then walk through both parts, swapping the entries that are in the wrong part. - // This would often lead to ~15% performance gain. - sz_size_t count_with_bit_set = 0; - for (sz_size_t i = 0; i != sequence->count; ++i) count_with_bit_set += (sequence->order[i] & mask) != 0; - split = sequence->count - count_with_bit_set; - - // It's possible that the sequence is already partitioned. - if (split != 0 && split != sequence->count) { - // Use two pointers to efficiently reposition elements. - // On pointer walks left-to-right from the start, and the other walks right-to-left from the end. - sz_size_t left = 0; - sz_size_t right = sequence->count - 1; - while (1) { - // Find the next element with the bit set on the left side. - while (left < split && !(sequence->order[left] & mask)) ++left; - // Find the next element without the bit set on the right side. - while (right >= split && (sequence->order[right] & mask)) --right; - // Swap the mispositioned elements. - if (left < split && right >= split) { - sz_u64_swap(sequence->order + left, sequence->order + right); - ++left; - --right; - } - else { break; } - } - } - - // Go down recursively. - if (bit_idx < bit_max) { - sz_sequence_t a = *sequence; - a.count = split; - sz_sort_recursion(&a, bit_idx + 1, bit_max, comparator, partial_order_length); - - sz_sequence_t b = *sequence; - b.order += split; - b.count -= split; - sz_sort_recursion(&b, bit_idx + 1, bit_max, comparator, partial_order_length); - } - // Reached the end of recursion. - else { - // Discard the prefixes. 
- sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; - for (sz_size_t i = 0; i != sequence->count; ++i) { order_half_words[i * 2 + 1] = 0; } - - sz_sequence_t a = *sequence; - a.count = split; - sz_sort_introsort(&a, comparator); - - sz_sequence_t b = *sequence; - b.order += split; - b.count -= split; - sz_sort_introsort(&b, comparator); - } -} - -SZ_INTERNAL sz_bool_t _sz_sort_is_less(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { - sz_cptr_t i_str = sequence->get_start(sequence, i_key); - sz_cptr_t j_str = sequence->get_start(sequence, j_key); - sz_size_t i_len = sequence->get_length(sequence, i_key); - sz_size_t j_len = sequence->get_length(sequence, j_key); - return (sz_bool_t)(sz_order_serial(i_str, i_len, j_str, j_len) == sz_less_k); -} - -SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t partial_order_length) { - -#if SZ_DETECT_BIG_ENDIAN - // TODO: Implement partial sort for big-endian systems. For now this sorts the whole thing. - sz_unused(partial_order_length); - sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); -#else - - // Export up to 4 bytes into the `sequence` bits themselves - for (sz_size_t i = 0; i != sequence->count; ++i) { - sz_cptr_t begin = sequence->get_start(sequence, sequence->order[i]); - sz_size_t length = sequence->get_length(sequence, sequence->order[i]); - length = length > 4u ? 4u : length; - sz_ptr_t prefix = (sz_ptr_t)&sequence->order[i]; - for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; - } - - // Perform optionally-parallel radix sort on them - sz_sort_recursion(sequence, 0, 32, (sz_sequence_comparator_t)_sz_sort_is_less, partial_order_length); -#endif -} - -SZ_PUBLIC void sz_sort(sz_sequence_t *sequence) { -#if SZ_DETECT_BIG_ENDIAN - sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); -#else - sz_sort_partial(sequence, sequence->count); -#endif -} - -#pragma endregion - -/* - * @brief AVX2 implementation of the string search algorithms. - * Very minimalistic, but still faster than the serial implementation. - */ -#pragma region AVX2 Implementation - -#if SZ_USE_X86_AVX2 -#pragma GCC push_options -#pragma GCC target("avx2") -#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) -#include - -/** - * @brief Helper structure to simplify work with 256-bit registers. - */ -typedef union sz_u256_vec_t { - __m256i ymm; - __m128i xmms[2]; - sz_u64_t u64s[4]; - sz_u32_t u32s[8]; - sz_u16_t u16s[16]; - sz_u8_t u8s[32]; -} sz_u256_vec_t; - -SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: - //! https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations - return sz_order_serial(a, a_length, b, b_length); -} - -SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_u256_vec_t a_vec, b_vec; - - while (length >= 32) { - a_vec.ymm = _mm256_lddqu_si256((__m256i const *)a); - b_vec.ymm = _mm256_lddqu_si256((__m256i const *)b); - // One approach can be to use "movemasks", but we could also use a bitwise matching like `_mm256_testnzc_si256`. 
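/*  Illustrative sketch (not part of this patch): packing a short string prefix into the
 *  upper bytes of a 64-bit index, the same trick `sz_sort_partial` above uses before its
 *  radix pass. On a little-endian machine the bytes are written from the top down, so
 *  comparing the resulting integers approximates lexicographic order of the prefixes.
 *  The helper name is hypothetical. */
#include <stddef.h>
#include <stdint.h>

static uint64_t pack_prefix_into_index(char const *str, size_t length, uint32_t index) {
    uint64_t packed = (uint64_t)index;      // The low 32 bits keep the original position.
    uint8_t *bytes = (uint8_t *)&packed;
    size_t prefix = length > 4u ? 4u : length;
    // Highest byte gets the first character, so integer comparison mimics `memcmp` on the prefix.
    for (size_t j = 0; j != prefix; ++j) bytes[7 - j] = (uint8_t)str[j];
    return packed;
}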
- int difference_mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi8(a_vec.ymm, b_vec.ymm)); - if (difference_mask == 0) { a += 32, b += 32, length -= 32; } - else { return sz_false_k; } - } - - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; -} - -SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - char value_char = *(char *)&value; - __m256i value_vec = _mm256_set1_epi8(value_char); - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores". - // - // for (; length >= 32; target += 32, length -= 32) _mm256_storeu_si256(target, value_vec); - // sz_fill_serial(target, length, value); - // - // When the buffer is small, there isn't much to innovate. - if (length <= 32) sz_fill_serial(target, length, value); - // When the buffer is aligned, we can avoid any split-stores. - else { - sz_size_t head_length = (32 - ((sz_size_t)target % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - sz_u16_t value16 = (sz_u16_t)value * 0x0101u; - sz_u32_t value32 = (sz_u32_t)value16 * 0x00010001u; - sz_u64_t value64 = (sz_u64_t)value32 * 0x0000000100000001ull; - - // Fill the head of the buffer. This part is much cleaner with AVX-512. - if (head_length & 1) *(sz_u8_t *)target = value, target++, head_length--; - if (head_length & 2) *(sz_u16_t *)target = value16, target += 2, head_length -= 2; - if (head_length & 4) *(sz_u32_t *)target = value32, target += 4, head_length -= 4; - if (head_length & 8) *(sz_u64_t *)target = value64, target += 8, head_length -= 8; - if (head_length & 16) - _mm_store_si128((__m128i *)target, _mm_set1_epi8(value_char)), target += 16, head_length -= 16; - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - - // Fill the aligned body of the buffer. - for (; body_length >= 32; target += 32, body_length -= 32) _mm256_store_si256((__m256i *)target, value_vec); - - // Fill the tail of the buffer. This part is much cleaner with AVX-512. - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - if (tail_length & 16) - _mm_store_si128((__m128i *)target, _mm_set1_epi8(value_char)), target += 16, tail_length -= 16; - if (tail_length & 8) *(sz_u64_t *)target = value64, target += 8, tail_length -= 8; - if (tail_length & 4) *(sz_u32_t *)target = value32, target += 4, tail_length -= 4; - if (tail_length & 2) *(sz_u16_t *)target = value16, target += 2, tail_length -= 2; - if (tail_length & 1) *(sz_u8_t *)target = value, target++, tail_length--; - } -} - -SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores" and "loads". - // - // for (; length >= 32; target += 32, source += 32, length -= 32) - // _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - // sz_copy_serial(target, source, length); - // - // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core, - // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. - // For now, let's avoid the cases beyond the L2 size. 
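/*  A minimal sketch of the head/body/tail split used by the fill and copy kernels above:
 *  the head advances `target` to a 32-byte boundary, the body is a multiple of 32 bytes that
 *  can use aligned stores, and the tail is whatever remains. Purely illustrative arithmetic,
 *  with defensive clamping that the real kernels don't need for buffers over 32 bytes. */
#include <stddef.h>
#include <stdint.h>

typedef struct { size_t head, body, tail; } span_split_t;

static span_split_t split_for_32_byte_alignment(void const *target, size_t length) {
    span_split_t s;
    s.head = (32 - ((uintptr_t)target % 32)) % 32;  // 0..31 bytes until the next boundary.
    if (s.head > length) s.head = length;           // Tiny buffers may not even reach it.
    s.tail = ((uintptr_t)target + length) % 32;     // 0..31 trailing bytes.
    if (s.head + s.tail > length) s.tail = length - s.head;
    s.body = length - s.head - s.tail;              // Zero or a multiple of 32.
    return s;
}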
- int is_huge = length > 1ull * 1024ull * 1024ull; - if (length <= 32) { sz_copy_serial(target, source, length); } - // When dealing wirh larger arrays, the optimization is not as simple as with the `sz_fill_avx2` function, - // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer, - // we can use aligned loads and stores, and the performance will be great. - else if ((sz_size_t)target % 32 == 0 && (sz_size_t)source % 32 == 0 && !is_huge) { - for (; length >= 32; target += 32, source += 32, length -= 32) - _mm256_store_si256((__m256i *)target, _mm256_load_si256((__m256i const *)source)); - if (length) sz_copy_serial(target, source, length); - } - // The trickiest case is when both `source` and `target` are not aligned. - // In such and simpler cases we can copy enough bytes into `target` to reach its cacheline boundary, - // and then combine unaligned loads with aligned stores. - else { - sz_size_t head_length = (32 - ((sz_size_t)target % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - - // Fill the head of the buffer. This part is much cleaner with AVX-512. - if (head_length & 1) *(sz_u8_t *)target = *(sz_u8_t *)source, target++, source++, head_length--; - if (head_length & 2) *(sz_u16_t *)target = *(sz_u16_t *)source, target += 2, source += 2, head_length -= 2; - if (head_length & 4) *(sz_u32_t *)target = *(sz_u32_t *)source, target += 4, source += 4, head_length -= 4; - if (head_length & 8) *(sz_u64_t *)target = *(sz_u64_t *)source, target += 8, source += 8, head_length -= 8; - if (head_length & 16) - _mm_store_si128((__m128i *)target, _mm_lddqu_si128((__m128i const *)source)), target += 16, source += 16, - head_length -= 16; - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - - // Fill the aligned body of the buffer. - if (!is_huge) { - for (; body_length >= 32; target += 32, source += 32, body_length -= 32) - _mm256_store_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - } - // When the biffer is huge, we can traverse it in 2 directions. - else { - for (; body_length >= 64; target += 32, source += 32, body_length -= 64) { - _mm256_store_si256((__m256i *)(target), _mm256_lddqu_si256((__m256i const *)(source))); - _mm256_store_si256((__m256i *)(target + body_length - 32), - _mm256_lddqu_si256((__m256i const *)(source + body_length - 32))); - } - if (body_length) _mm256_store_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); + // Progress through the upper triangle of the Levenshtein matrix. + sz_size_t next_diagonal_index = 2; + for (; next_diagonal_index != n; ++next_diagonal_index) { + sz_size_t const next_diagonal_length = next_diagonal_index + 1; + for (sz_size_t i = 0; i + 2 < next_diagonal_length; ++i) { + sz_size_t cost_of_substitution = shorter[next_diagonal_index - i - 2] != longer[i]; + sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; + sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; + next_distances[i + 1] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); } - - // Fill the tail of the buffer. This part is much cleaner with AVX-512. 
- sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - if (tail_length & 16) - _mm_store_si128((__m128i *)target, _mm_lddqu_si128((__m128i const *)source)), target += 16, source += 16, - tail_length -= 16; - if (tail_length & 8) *(sz_u64_t *)target = *(sz_u64_t *)source, target += 8, source += 8, tail_length -= 8; - if (tail_length & 4) *(sz_u32_t *)target = *(sz_u32_t *)source, target += 4, source += 4, tail_length -= 4; - if (tail_length & 2) *(sz_u16_t *)target = *(sz_u16_t *)source, target += 2, source += 2, tail_length -= 2; - if (tail_length & 1) *(sz_u8_t *)target = *(sz_u8_t *)source, target++, source++, tail_length--; - } -} - -SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - if (target < source || target >= source + length) { - for (; length >= 32; target += 32, source += 32, length -= 32) - _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - while (length--) *(target++) = *(source++); - } - else { - // Jump to the end and walk backwards. - for (target += length, source += length; length >= 32; length -= 32) - _mm256_storeu_si256((__m256i *)(target -= 32), _mm256_lddqu_si256((__m256i const *)(source -= 32))); - while (length--) *(--target) = *(--source); + // Don't forget to populate the first row and the first column of the Levenshtein matrix. + next_distances[0] = next_distances[next_diagonal_length - 1] = next_diagonal_index; + // Perform a circular rotation of those buffers, to reuse the memory. + sz_size_t *temporary = previous_distances; + previous_distances = current_distances; + current_distances = next_distances; + next_distances = temporary; } -} -SZ_PUBLIC sz_u64_t sz_checksum_avx2(sz_cptr_t text, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "loads". - // - // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core, - // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. - // For now, let's avoid the cases beyond the L2 size. - int is_huge = length > 1ull * 1024ull * 1024ull; - - // When the buffer is small, there isn't much to innovate. - if (length <= 32) { return sz_checksum_serial(text, length); } - else if (!is_huge) { - sz_u256_vec_t text_vec, sums_vec; - sums_vec.ymm = _mm256_setzero_si256(); - for (; length >= 32; text += 32, length -= 32) { - text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); + // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a + // larger diagonal. From now onwards, we will be shrinking. Instead of adding value equal to the skewed diagonal + // index on either side, we will be cropping those values out. 
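/*  A small illustration of the anti-diagonal ("skewed") traversal used above. Cell (row, col)
 *  of an n-by-n Levenshtein matrix lives on diagonal `row + col`; within a diagonal the cells
 *  depend only on the two previous diagonals, which is why three rotating buffers suffice.
 *  The mapping below is illustrative and assumes the same square, equal-length setup. */
#include <stddef.h>

typedef struct { size_t row, col; } matrix_cell_t;

static matrix_cell_t cell_on_diagonal(size_t diagonal_index, size_t offset, size_t n) {
    matrix_cell_t cell;
    if (diagonal_index < n) {  // Upper triangle: diagonals grow from 1 to n cells.
        cell.row = diagonal_index - offset;
        cell.col = offset;
    }
    else {                     // Lower triangle: diagonals shrink back down to 1 cell.
        cell.row = (n - 1) - offset;
        cell.col = (diagonal_index - n + 1) + offset;
    }
    return cell;
}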
+ sz_size_t diagonals_count = n + n - 1; + for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { + sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; + for (sz_size_t i = 0; i != next_diagonal_length; ++i) { + sz_size_t cost_of_substitution = shorter[shorter_length - 1 - i] != longer[next_diagonal_index - n + i]; + sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; + sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; + next_distances[i] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); } - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - sz_u64_t result = low + high; - if (length) result += sz_checksum_serial(text, length); - return result; + // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift, + // dropping the first element in the current array. + sz_size_t *temporary = previous_distances; + previous_distances = current_distances + 1; + current_distances = next_distances; + next_distances = temporary; } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // Most notably, we can avoid populating the cache with the entire buffer, and instead traverse it in 2 directions. - else { - sz_size_t head_length = (32 - ((sz_size_t)text % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(text + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - sz_u64_t result = 0; - - // Handle the head - while (head_length--) result += *text++; - - sz_u256_vec_t text_vec, sums_vec; - sums_vec.ymm = _mm256_setzero_si256(); - // Fill the aligned body of the buffer. - if (!is_huge) { - for (; body_length >= 32; text += 32, body_length -= 32) { - text_vec.ymm = _mm256_stream_load_si256((__m256i const *)text); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - } - // When the biffer is huge, we can traverse it in 2 directions. - else { - sz_u256_vec_t text_reversed_vec, sums_reversed_vec; - sums_reversed_vec.ymm = _mm256_setzero_si256(); - for (; body_length >= 64; text += 64, body_length -= 64) { - text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - text_reversed_vec.ymm = _mm256_stream_load_si256((__m256i *)(text + body_length - 64)); - sums_reversed_vec.ymm = _mm256_add_epi64( - sums_reversed_vec.ymm, _mm256_sad_epu8(text_reversed_vec.ymm, _mm256_setzero_si256())); - } - if (body_length >= 32) { - text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, sums_reversed_vec.ymm); - } - // Handle the tail - while (tail_length--) result += *text++; - - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. 
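/*  Sketch of the horizontal reduction referenced just above: collapsing a YMM register holding
 *  four 64-bit partial sums into one scalar. Assumes AVX2 and x86-64 (for the 64-bit extract
 *  intrinsics); illustrative only. */
#include <immintrin.h>
#include <stdint.h>

static inline uint64_t reduce_four_u64_lanes(__m256i sums) {
    __m128i low_xmm = _mm256_castsi256_si128(sums);        // Lanes 0 and 1.
    __m128i high_xmm = _mm256_extracti128_si256(sums, 1);  // Lanes 2 and 3.
    __m128i pair = _mm_add_epi64(low_xmm, high_xmm);       // Two partial sums left.
    return (uint64_t)_mm_cvtsi128_si64(pair) + (uint64_t)_mm_extract_epi64(pair, 1);
}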
- __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - result += low + high; - return result; - } + // Cache scalar before `free` call. + sz_size_t result = current_distances[0]; + alloc->free(distances, buffer_length, alloc->handle); + return result; } -SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { +/** + * @brief Compute the Levenshtein distance between two strings using the Wagner-Fisher algorithm. + * Stores only 2 rows of the Levenshtein matrix, but uses 64-bit integers for the distance values, + * and upcasts UTF8 variable-length codepoints to 64-bit integers for faster addressing. + * + * ! In the worst case for 2 strings of length 100, that contain just one 16-bit codepoint this will result in extra: + * + 2 rows * 100 slots * 8 bytes/slot = 1600 bytes of memory for the two rows of the Levenshtein matrix rows. + * + 100 codepoints * 2 strings * 4 bytes/codepoint = 800 bytes of memory for the UTF8 buffer. + * = 2400 bytes of memory or @b 12x memory amplification! + */ +SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // + sz_cptr_t longer, sz_size_t longer_length, // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_size_t bound, sz_bool_t can_be_unicode, sz_memory_allocator_t *alloc) { - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - // But if at least 3 cache lines are touched, the AVX-2 implementation should be faster. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; + // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. + sz_memory_allocator_t global_alloc; + if (!alloc) { + sz_memory_allocator_init_default(&global_alloc); + alloc = &global_alloc; } - // We need to pull the lookup table into 8x YMM registers. - // The biggest issue is reorganizing the data in the lookup table, as AVX2 doesn't have 256-bit shuffle, - // it only has 128-bit "within-lane" shuffle. Still, it's wiser to use full YMM registers, instead of XMM, - // so that we can at least compensate high latency with twice larger window and one more level of lookup. 
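/*  Scalar reference for the transform that the nibble-shuffle code below vectorizes:
 *  every input byte is replaced by the byte stored at that index in a 256-entry table.
 *  This is the semantic contract; the AVX2 version only changes how the lookup is performed. */
#include <stddef.h>
#include <stdint.h>

static void look_up_transform_reference(uint8_t const *source, size_t length,
                                        uint8_t const *table_256, uint8_t *target) {
    for (size_t i = 0; i != length; ++i) target[i] = table_256[source[i]];
}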
- sz_u256_vec_t lut_0_to_15_vec, lut_16_to_31_vec, lut_32_to_47_vec, lut_48_to_63_vec, // - lut_64_to_79_vec, lut_80_to_95_vec, lut_96_to_111_vec, lut_112_to_127_vec, // - lut_128_to_143_vec, lut_144_to_159_vec, lut_160_to_175_vec, lut_176_to_191_vec, // - lut_192_to_207_vec, lut_208_to_223_vec, lut_224_to_239_vec, lut_240_to_255_vec; - - lut_0_to_15_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut))); - lut_16_to_31_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 16))); - lut_32_to_47_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 32))); - lut_48_to_63_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 48))); - lut_64_to_79_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 64))); - lut_80_to_95_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 80))); - lut_96_to_111_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 96))); - lut_112_to_127_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 112))); - lut_128_to_143_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 128))); - lut_144_to_159_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 144))); - lut_160_to_175_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 160))); - lut_176_to_191_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 176))); - lut_192_to_207_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 192))); - lut_208_to_223_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 208))); - lut_224_to_239_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 224))); - lut_240_to_255_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 240))); - - // Assuming each lookup is performed within 16 elements of 256, we need to reduce the scope by 16x = 2^4. - sz_u256_vec_t not_first_bit_vec, not_second_bit_vec, not_third_bit_vec, not_fourth_bit_vec; - - /// Top and bottom nibbles of the source are used separately. - sz_u256_vec_t source_vec, source_bot_vec; - sz_u256_vec_t blended_0_to_31_vec, blended_32_to_63_vec, blended_64_to_95_vec, blended_96_to_127_vec, - blended_128_to_159_vec, blended_160_to_191_vec, blended_192_to_223_vec, blended_224_to_255_vec; - - // Handling the head. - while (length >= 32) { - // Load and separate the nibbles of each byte in the source. - source_vec.ymm = _mm256_lddqu_si256((__m256i const *)source); - source_bot_vec.ymm = _mm256_and_si256(source_vec.ymm, _mm256_set1_epi8((char)0x0F)); - - // In the first round, we select using the 4th bit. 
- not_fourth_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x10), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_16_to_31_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_0_to_15_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_32_to_63_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_48_to_63_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_32_to_47_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_64_to_95_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_80_to_95_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_64_to_79_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_96_to_127_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_112_to_127_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_96_to_111_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_144_to_159_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_128_to_143_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_160_to_191_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_176_to_191_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_160_to_175_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_192_to_223_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_208_to_223_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_192_to_207_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_224_to_255_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_240_to_255_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_224_to_239_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - - // Perform a tree-like reduction of the 8x "blended" YMM registers, depending on the "source" content. - // The first round selects using the 3rd bit. - not_third_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x20), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_32_to_63_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_third_bit_vec.ymm); - blended_64_to_95_vec.ymm = _mm256_blendv_epi8( // - blended_96_to_127_vec.ymm, // - blended_64_to_95_vec.ymm, // - not_third_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - blended_160_to_191_vec.ymm, // - blended_128_to_159_vec.ymm, // - not_third_bit_vec.ymm); - blended_192_to_223_vec.ymm = _mm256_blendv_epi8( // - blended_224_to_255_vec.ymm, // - blended_192_to_223_vec.ymm, // - not_third_bit_vec.ymm); - - // The second round selects using the 2nd bit. - not_second_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x40), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_64_to_95_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_second_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - blended_192_to_223_vec.ymm, // - blended_128_to_159_vec.ymm, // - not_second_bit_vec.ymm); - - // The third round selects using the 1st bit. 
- not_first_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x80), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_128_to_159_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_first_bit_vec.ymm); - - // And dump the result into the target. - _mm256_storeu_si256((__m256i *)target, blended_0_to_31_vec.ymm); - source += 32, target += 32, length -= 32; - } + // A good idea may be to dispatch different kernels for different string lengths. + // Like using `uint8_t` counters for strings under 255 characters long. + // Good in theory, this results in frequent upcasts and downcasts in serial code. + // On strings over 20 bytes, using `uint8` over `uint64` on 64-bit x86 CPU doubles the execution time. + // So one must be very cautious with such optimizations. + typedef sz_size_t _distance_t; - // Handle the tail. - if (length) sz_look_up_transform_serial(source, length, lut, target); -} + // Compute the number of columns in our Levenshtein matrix. + sz_size_t const n = shorter_length + 1; -SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - int mask; - sz_u256_vec_t h_vec, n_vec; - n_vec.ymm = _mm256_set1_epi8(n[0]); + // If a buffering memory-allocator is provided, this operation is practically free, + // and cheaper than allocating even 512 bytes (for small distance matrices) on stack. + sz_size_t buffer_length = sizeof(_distance_t) * (n * 2); - while (h_length >= 32) { - h_vec.ymm = _mm256_lddqu_si256((__m256i const *)h); - mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm)); - if (mask) return h + sz_u32_ctz(mask); - h += 32, h_length -= 32; + // If the strings contain Unicode characters, let's estimate the max character width, + // and use it to allocate a larger buffer to decode UTF8. + if ((can_be_unicode == sz_true_k) && + (sz_isascii(longer, longer_length) == sz_false_k || sz_isascii(shorter, shorter_length) == sz_false_k)) { + buffer_length += (shorter_length + longer_length) * sizeof(sz_rune_t); } + else { can_be_unicode = sz_false_k; } - return sz_find_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - int mask; - sz_u256_vec_t h_vec, n_vec; - n_vec.ymm = _mm256_set1_epi8(n[0]); + // If the allocation fails, return the maximum distance. + sz_ptr_t const buffer = (sz_ptr_t)alloc->allocate(buffer_length, alloc->handle); + if (!buffer) return SZ_SIZE_MAX; - while (h_length >= 32) { - h_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + h_length - 32)); - mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm)); - if (mask) return h + h_length - 1 - sz_u32_clz(mask); - h_length -= 32; + // Let's export the UTF8 sequence into the newly allocated buffer at the end. + if (can_be_unicode == sz_true_k) { + sz_rune_t *const longer_utf32 = (sz_rune_t *)(buffer + sizeof(_distance_t) * (n * 2)); + sz_rune_t *const shorter_utf32 = longer_utf32 + longer_length; + // Export the UTF8 sequences into the newly allocated buffer. + longer_length = _sz_export_utf8_to_utf32(longer, longer_length, longer_utf32); + shorter_length = _sz_export_utf8_to_utf32(shorter, shorter_length, shorter_utf32); + longer = (sz_cptr_t)longer_utf32; + shorter = (sz_cptr_t)shorter_utf32; } - return sz_rfind_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. 
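/*  A scalar sketch of the filtering idea used by the vectorized substring search above:
 *  probe a few characters of the needle (here, simply the first and last) at every haystack
 *  offset, and only fall back to a full comparison when both match. The offsets StringZilla
 *  actually picks come from `_sz_locate_needle_anomalies`; the fixed choice below is a
 *  simplification. */
#include <stddef.h>
#include <string.h>

static char const *find_with_two_character_filter(char const *haystack, size_t h_length,
                                                  char const *needle, size_t n_length) {
    if (n_length == 0 || h_length < n_length) return NULL;
    char first = needle[0], last = needle[n_length - 1];
    for (size_t i = 0; i + n_length <= h_length; ++i)
        if (haystack[i] == first && haystack[i + n_length - 1] == last &&
            memcmp(haystack + i, needle, n_length) == 0)
            return haystack + i;
    return NULL;
}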
- if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_avx2(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into YMM registers. - int matches; - sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]); - n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]); - n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]); - - // Scan through the string. - for (; h_length >= n_length + 32; h += 32, h_length -= 32) { - h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_first)); - h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_mid)); - h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_last)); - matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); - while (matches) { - int potential_offset = sz_u32_ctz(matches); - if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } + // Let's parameterize the core logic for different character types and distance types. +#define _wagner_fisher_unbounded(_distance_t, _char_t) \ + /* Now let's cast our pointer to avoid it in subsequent sections. */ \ + _char_t const *const longer_chars = (_char_t const *)longer; \ + _char_t const *const shorter_chars = (_char_t const *)shorter; \ + _distance_t *previous_distances = (_distance_t *)buffer; \ + _distance_t *current_distances = previous_distances + n; \ + /* Initialize the first row of the Levenshtein matrix with `iota`-style arithmetic progression. */ \ + for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ + /* The main loop of the algorithm with quadratic complexity. */ \ + for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ + _char_t const longer_char = longer_chars[idx_longer]; \ + /* Using pure pointer arithmetic is faster than iterating with an index. */ \ + _char_t const *shorter_ptr = shorter_chars; \ + _distance_t const *previous_ptr = previous_distances; \ + _distance_t *current_ptr = current_distances; \ + _distance_t *const current_end = current_ptr + shorter_length; \ + current_ptr[0] = idx_longer + 1; \ + for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ + _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ + /* We can avoid `+1` for costs here, shifting it to post-minimum computation, */ \ + /* saving one increment operation. */ \ + _distance_t cost_deletion = previous_ptr[1]; \ + _distance_t cost_insertion = current_ptr[0]; \ + /* ? It might be a good idea to enforce branchless execution here. */ \ + /* ? The caveat being that the benchmarks on longer sequences backfire and more research is needed. */ \ + current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ + } \ + /* Swap `previous_distances` and `current_distances` pointers. 
*/ \ + _distance_t *temporary = previous_distances; \ + previous_distances = current_distances; \ + current_distances = temporary; \ + } \ + /* Cache scalar before `free` call. */ \ + sz_size_t result = previous_distances[shorter_length]; \ + alloc->free(buffer, buffer_length, alloc->handle); \ + return result; - return sz_find_serial(h, h_length, n, n_length); -} + // Let's define a separate variant for bounded distance computation. + // Practically the same as unbounded, but also collecting the running minimum within each row for early exit. +#define _wagner_fisher_bounded(_distance_t, _char_t) \ + _char_t const *const longer_chars = (_char_t const *)longer; \ + _char_t const *const shorter_chars = (_char_t const *)shorter; \ + _distance_t *previous_distances = (_distance_t *)buffer; \ + _distance_t *current_distances = previous_distances + n; \ + for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ + for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ + _char_t const longer_char = longer_chars[idx_longer]; \ + _char_t const *shorter_ptr = shorter_chars; \ + _distance_t const *previous_ptr = previous_distances; \ + _distance_t *current_ptr = current_distances; \ + _distance_t *const current_end = current_ptr + shorter_length; \ + current_ptr[0] = idx_longer + 1; \ + /* Initialize min_distance with a value greater than bound */ \ + _distance_t min_distance = bound - 1; \ + for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ + _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ + _distance_t cost_deletion = previous_ptr[1]; \ + _distance_t cost_insertion = current_ptr[0]; \ + current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ + /* Keep track of the minimum distance seen so far in this row */ \ + min_distance = sz_min_of_two(current_ptr[1], min_distance); \ + } \ + /* If the minimum distance in this row exceeded the bound, return early */ \ + if (min_distance >= bound) { \ + alloc->free(buffer, buffer_length, alloc->handle); \ + return bound; \ + } \ + _distance_t *temporary = previous_distances; \ + previous_distances = current_distances; \ + current_distances = temporary; \ + } \ + sz_size_t result = previous_distances[shorter_length]; \ + alloc->free(buffer, buffer_length, alloc->handle); \ + return sz_min_of_two(result, bound); -SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx2(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into YMM registers. - int matches; - sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]); - n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]); - n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]); - - // Scan through the string. 
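/*  A compact, standalone version of the two-row dynamic program the macros above expand to,
 *  with fixed `size_t` distances, plain bytes, and `malloc` instead of the library's pluggable
 *  allocator. Assumes ASCII input; purely a reference sketch, not the library's API. */
#include <stddef.h>
#include <stdlib.h>

static size_t levenshtein_two_rows(char const *longer, size_t longer_length,
                                   char const *shorter, size_t shorter_length) {
    size_t n = shorter_length + 1;
    size_t *previous = (size_t *)malloc(2 * n * sizeof(size_t));
    if (!previous) return (size_t)-1;
    size_t *current = previous + n;
    for (size_t j = 0; j != n; ++j) previous[j] = j;  // First row is an arithmetic progression.
    for (size_t i = 0; i != longer_length; ++i) {
        current[0] = i + 1;
        for (size_t j = 0; j != shorter_length; ++j) {
            size_t substitution = previous[j] + (longer[i] != shorter[j]);
            size_t deletion = previous[j + 1];
            size_t insertion = current[j];
            size_t best_of_two = deletion < insertion ? deletion : insertion;
            current[j + 1] = substitution < best_of_two + 1 ? substitution : best_of_two + 1;
        }
        size_t *temporary = previous;
        previous = current, current = temporary;
    }
    size_t result = previous[shorter_length];
    free(previous < current ? previous : current);  // Free the original allocation base.
    return result;
}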
- sz_cptr_t h_reversed; - for (; h_length >= n_length + 32; h_length -= 32) { - h_reversed = h + h_length - n_length - 32 + 1; - h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_first)); - h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_mid)); - h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_last)); - matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); - while (matches) { - int potential_offset = sz_u32_clz(matches); - if (sz_equal(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - matches &= ~(1 << (31 - potential_offset)); - } + // Dispatch the actual computation. + if (!bound) { + if (can_be_unicode == sz_true_k) { _wagner_fisher_unbounded(sz_size_t, sz_rune_t); } + else { _wagner_fisher_unbounded(sz_size_t, sz_u8_t); } } - - return sz_rfind_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_avx2(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - - // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. - // That way when we invoke `_mm256_shuffle_epi8` we can use the same mask for both lanes. - sz_u256_vec_t filter_even_vec, filter_odd_vec; - for (sz_size_t i = 0; i != 16; ++i) - filter_even_vec.u8s[i] = filter->_u8s[i * 2], filter_odd_vec.u8s[i] = filter->_u8s[i * 2 + 1]; - filter_even_vec.xmms[1] = filter_even_vec.xmms[0]; - filter_odd_vec.xmms[1] = filter_odd_vec.xmms[0]; - - sz_u256_vec_t text_vec; - sz_u256_vec_t matches_vec; - sz_u256_vec_t lower_nibbles_vec, higher_nibbles_vec; - sz_u256_vec_t bitset_even_vec, bitset_odd_vec; - sz_u256_vec_t bitmask_vec, bitmask_lookup_vec; - bitmask_lookup_vec.ymm = _mm256_set_epi8(-128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); - - while (length >= 32) { - // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set" - // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so - // StrinZilla uses a somewhat different approach. - // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new - // - // sz_u8_t input = *(sz_u8_t const *)text; - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = filter_even_vec.u8s[hi_nibble]; - // sz_u8_t bitset_odd = filter_odd_vec.u8s[hi_nibble]; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_u8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd; - // if ((bitset & bitmask) != 0) return text; - // else { length--, text++; } - // - // The nice part about this, loading the strided data is vey easy with Arm NEON, - // while with x86 CPUs after AVX, shuffles within 256 bits shouldn't be an issue either. 
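/*  Scalar counterpart of the byte-set test that the nibble-shuffle code below vectorizes:
 *  a 256-bit set stored as 32 bytes, where byte `c` is a member when bit `c % 8` of byte
 *  `c / 8` is set. The struct here is illustrative, not the library's `sz_charset_t`. */
#include <stdint.h>

typedef struct { uint8_t bytes[32]; } byte_set_t;

static int byte_set_contains(byte_set_t const *set, uint8_t c) {
    return (set->bytes[c >> 3] >> (c & 7)) & 1;
}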
- text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); - lower_nibbles_vec.ymm = _mm256_and_si256(text_vec.ymm, _mm256_set1_epi8(0x0f)); - bitmask_vec.ymm = _mm256_shuffle_epi8(bitmask_lookup_vec.ymm, lower_nibbles_vec.ymm); - // - // At this point we can validate the `bitmask_vec` contents like this: - // - // for (sz_size_t i = 0; i != 32; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); - // } - // - // Shift right every byte by 4 bits. - // There is no `_mm256_srli_epi8` intrinsic, so we have to use `_mm256_srli_epi16` - // and combine it with a mask to clear the higher bits. - higher_nibbles_vec.ymm = _mm256_and_si256(_mm256_srli_epi16(text_vec.ymm, 4), _mm256_set1_epi8(0x0f)); - bitset_even_vec.ymm = _mm256_shuffle_epi8(filter_even_vec.ymm, higher_nibbles_vec.ymm); - bitset_odd_vec.ymm = _mm256_shuffle_epi8(filter_odd_vec.ymm, higher_nibbles_vec.ymm); - // - // At this point we can validate the `bitset_even_vec` and `bitset_odd_vec` contents like this: - // - // for (sz_size_t i = 0; i != 32; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t const *bitset_ptr = &filter->_u8s[0]; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; - // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); - // } - // - __m256i take_first = _mm256_cmpgt_epi8(_mm256_set1_epi8(8), lower_nibbles_vec.ymm); - bitset_even_vec.ymm = _mm256_blendv_epi8(bitset_odd_vec.ymm, bitset_even_vec.ymm, take_first); - - // It would have been great to have an instruction that tests the bits and then broadcasts - // the matching bit into all bits in that byte. But we don't have that, so we have to - // `and`, `cmpeq`, `movemask`, and then invert at the end... - matches_vec.ymm = _mm256_and_si256(bitset_even_vec.ymm, bitmask_vec.ymm); - matches_vec.ymm = _mm256_cmpeq_epi8(matches_vec.ymm, _mm256_setzero_si256()); - int matches_mask = ~_mm256_movemask_epi8(matches_vec.ymm); - if (matches_mask) { - int offset = sz_u32_ctz(matches_mask); - return text + offset; - } - else { text += 32, length -= 32; } + else { + if (can_be_unicode == sz_true_k) { _wagner_fisher_bounded(sz_size_t, sz_rune_t); } + else { _wagner_fisher_bounded(sz_size_t, sz_u8_t); } } - - return sz_find_charset_serial(text, length, filter); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx2(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); -} - -/** - * @brief There is no AVX2 instruction for fast multiplication of 64-bit integers. - * This implementation is coming from Agner Fog's Vector Class Library. 
- */ -SZ_INTERNAL __m256i _mm256_mul_epu64(__m256i a, __m256i b) { - __m256i bswap = _mm256_shuffle_epi32(b, 0xB1); - __m256i prodlh = _mm256_mullo_epi32(a, bswap); - __m256i zero = _mm256_setzero_si256(); - __m256i prodlh2 = _mm256_hadd_epi32(prodlh, zero); - __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2, 0x73); - __m256i prodll = _mm256_mul_epu32(a, b); - __m256i prod = _mm256_add_epi64(prodll, prodlh3); - return prod; } -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - if (length < 4 * window_length) { - sz_hashes_serial(start, length, window_length, step, callback, callback_handle); - return; - } - - // Using AVX2, we can perform 4 long integer multiplications and additions within one register. - // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel. - sz_size_t const max_hashes = length - window_length + 1; - sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads. - sz_u8_t const *text_first = (sz_u8_t const *)start; - sz_u8_t const *text_second = text_first + min_hashes_per_thread; - sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2; - sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3; - sz_u8_t const *text_end = text_first + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Broadcast the constants into the registers. - sz_u256_vec_t prime_vec, golden_ratio_vec; - sz_u256_vec_t base_low_vec, base_high_vec, prime_power_low_vec, prime_power_high_vec, shift_high_vec; - base_low_vec.ymm = _mm256_set1_epi64x(31ull); - base_high_vec.ymm = _mm256_set1_epi64x(257ull); - shift_high_vec.ymm = _mm256_set1_epi64x(77ull); - prime_vec.ymm = _mm256_set1_epi64x(SZ_U64_MAX_PRIME); - golden_ratio_vec.ymm = _mm256_set1_epi64x(11400714819323198485ull); - prime_power_low_vec.ymm = _mm256_set1_epi64x(prime_power_low); - prime_power_high_vec.ymm = _mm256_set1_epi64x(prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u256_vec_t hash_low_vec, hash_high_vec, hash_mix_vec, chars_low_vec, chars_high_vec; - hash_low_vec.ymm = _mm256_setzero_si256(); - hash_high_vec.ymm = _mm256_setzero_si256(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. 
- hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - } +SZ_PUBLIC sz_size_t sz_edit_distance_serial( // + sz_cptr_t longer, sz_size_t longer_length, // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_size_t bound, sz_memory_allocator_t *alloc) { - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t const step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - hash_low_vec.ymm = - _mm256_sub_epi64(hash_low_vec.ymm, _mm256_mul_epu64(chars_low_vec.ymm, prime_power_low_vec.ymm)); - hash_high_vec.ymm = - _mm256_sub_epi64(hash_high_vec.ymm, _mm256_mul_epu64(chars_high_vec.ymm, prime_power_high_vec.ymm)); - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. - hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. 
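/*  A simplified scalar sketch of the rolling ("Rabin-Karp style") update the loop above
 *  performs per window: drop the oldest character, scale by the base, add the newest one.
 *  For brevity this variant lets 64-bit arithmetic wrap instead of reducing by the near-2^64
 *  prime that StringZilla uses, so the values differ from the library's hashes; the callback
 *  signature is hypothetical. */
#include <stddef.h>
#include <stdint.h>

static void rolling_hashes(uint8_t const *text, size_t length, size_t window,
                           void (*callback)(size_t offset, uint64_t hash, void *handle), void *handle) {
    uint64_t const base = 31ull;
    uint64_t hash = 0, top_power = 1;
    if (length < window || window == 0) return;
    for (size_t i = 0; i + 1 < window; ++i) top_power *= base;  // base^(window - 1)
    for (size_t i = 0; i != window; ++i) hash = hash * base + text[i];
    callback(0, hash, handle);
    for (size_t i = window; i != length; ++i) {
        hash = (hash - text[i - window] * top_power) * base + text[i];  // Slide by one byte.
        callback(i - window + 1, hash, handle);
    }
}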
- hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - if ((cycle & step_mask) == 0) { - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - } + // Let's make sure that we use the amount proportional to the + // number of elements in the shorter string, not the larger. + if (shorter_length > longer_length) { + sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); + sz_pointer_swap((void **)&longer, (void **)&shorter); } -} - -#pragma clang attribute pop -#pragma GCC pop_options -#endif -#pragma endregion - -/* - * @brief AVX-512 implementation of the string search algorithms. - * - * Different subsets of AVX-512 were introduced in different years: - * - 2017 SkyLake: F, CD, ER, PF, VL, DQ, BW - * - 2018 CannonLake: IFMA, VBMI - * - 2019 IceLake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES - * - 2020 TigerLake: VP2INTERSECT - */ -#pragma region AVX512 Implementation - -#if SZ_USE_X86_AVX512 -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) -#include - -/** - * @brief Helper structure to simplify work with 512-bit registers. - */ -typedef union sz_u512_vec_t { - __m512i zmm; - __m256i ymms[2]; - __m128i xmms[4]; - sz_u64_t u64s[8]; - sz_u32_t u32s[16]; - sz_u16_t u16s[32]; - sz_u8_t u8s[64]; - sz_i64_t i64s[8]; - sz_i32_t i32s[16]; -} sz_u512_vec_t; - -SZ_INTERNAL __mmask64 _sz_u64_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 64: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 64: - return _bzhi_u64(0xFFFFFFFFFFFFFFFF, n < 64 ? (sz_u32_t)n : 64); -} - -SZ_INTERNAL __mmask32 _sz_u32_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 32: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 32: - return _bzhi_u32(0xFFFFFFFF, n < 32 ? (sz_u32_t)n : 32); -} - -SZ_INTERNAL __mmask16 _sz_u16_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 16: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 16: - return _bzhi_u32(0xFFFFFFFF, n < 16 ? 
(sz_u32_t)n : 16); -} - -SZ_INTERNAL __mmask16 _sz_u16_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 16: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 16: - return (__mmask16)_bzhi_u32(0xFFFFFFFF, (sz_u32_t)n); -} - -SZ_INTERNAL __mmask32 _sz_u32_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 32: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 32: - return _bzhi_u32(0xFFFFFFFF, (sz_u32_t)n); -} - -SZ_INTERNAL __mmask64 _sz_u64_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 64: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 64: - return _bzhi_u64(0xFFFFFFFFFFFFFFFF, (sz_u32_t)n); -} -SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - sz_u512_vec_t a_vec, b_vec; - - // Pointer arithmetic is cheap, fetching memory is not! - // So we can use the masked loads to fetch at most one cache-line for each string, - // compare the prefixes, and only then move forward. - sz_size_t a_head_length = 64 - ((sz_size_t)a % 64); // 63 or less. - sz_size_t b_head_length = 64 - ((sz_size_t)b % 64); // 63 or less. - a_head_length = a_head_length < a_length ? a_head_length : a_length; - b_head_length = b_head_length < b_length ? b_head_length : b_length; - sz_size_t head_length = a_head_length < b_head_length ? a_head_length : b_head_length; - __mmask64 head_mask = _sz_u64_mask_until(head_length); - a_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, b); - __mmask64 mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - else if (head_length == a_length && head_length == b_length) { return sz_equal_k; } - else { a += head_length, b += head_length, a_length -= head_length, b_length -= head_length; } - - // The rare case, when both string are very long. - __mmask64 a_mask, b_mask; - while ((a_length >= 64) & (b_length >= 64)) { - a_vec.zmm = _mm512_loadu_si512(a); - b_vec.zmm = _mm512_loadu_si512(b); - mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - a += 64, b += 64, a_length -= 64, b_length -= 64; - } + // Skip the matching prefixes and suffixes, they won't affect the distance. + for (sz_cptr_t a_end = longer + longer_length, b_end = shorter + shorter_length; + longer != a_end && shorter != b_end && *longer == *shorter; + ++longer, ++shorter, --longer_length, --shorter_length); + for (; longer_length && shorter_length && longer[longer_length - 1] == shorter[shorter_length - 1]; + --longer_length, --shorter_length); - // In most common scenarios at least one of the strings is under 64 bytes. 
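The prefix- and suffix-trimming loops added above are worth restating in isolation, since they are what makes the later bounded checks so cheap. The sketch below is a hedged stand-alone version with illustrative names; it trims the shared ends and returns the length difference, which is a lower bound on the edit distance of what remains.

    #include <stddef.h>

    // Trim shared prefixes/suffixes, then return |length difference|,
    // which can never exceed the true edit distance of the remainder.
    static size_t edit_distance_lower_bound(char const *a, size_t a_len, char const *b, size_t b_len) {
        while (a_len && b_len && *a == *b) ++a, ++b, --a_len, --b_len;           // Shared prefix.
        while (a_len && b_len && a[a_len - 1] == b[b_len - 1]) --a_len, --b_len; // Shared suffix.
        return a_len > b_len ? a_len - b_len : b_len - a_len;
    }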
- if (a_length | b_length) { - a_mask = _sz_u64_clamp_mask_until(a_length); - b_mask = _sz_u64_clamp_mask_until(b_length); - a_vec.zmm = _mm512_maskz_loadu_epi8(a_mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(b_mask, b); - // The AVX-512 `_mm512_mask_cmpneq_epi8_mask` intrinsics are generally handy in such environments. - // They, however, have latency 3 on most modern CPUs. Using AVX2: `_mm256_cmpeq_epi8` would have - // been cheaper, if we didn't have to apply `_mm256_movemask_epi8` afterwards. - mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - // From logic perspective, the hardest cases are "abc\0" and "abc". - // The result must be `sz_greater_k`, as the latter is shorter. - else { return _sz_order_scalars(a_length, b_length); } + // Bounded computations may exit early. + int const is_bounded = bound < longer_length; + if (is_bounded) { + // If one of the strings is empty - the edit distance is equal to the length of the other one. + if (longer_length == 0) return sz_min_of_two(shorter_length, bound); + if (shorter_length == 0) return sz_min_of_two(longer_length, bound); + // If the difference in length is beyond the `bound`, there is no need to check at all. + if (longer_length - shorter_length > bound) return bound; } - return sz_equal_k; + if (shorter_length == 0) return longer_length; // If no mismatches were found - the distance is zero. + if (shorter_length == longer_length && !is_bounded) + return _sz_edit_distance_skewed_diagonals_serial(longer, longer_length, shorter, shorter_length, bound, alloc); + return _sz_edit_distance_wagner_fisher_serial( // + longer, longer_length, shorter, shorter_length, bound, sz_false_k, alloc); } -SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - __mmask64 mask; - sz_u512_vec_t a_vec, b_vec; - - while (length >= 64) { - a_vec.zmm = _mm512_loadu_si512(a); - b_vec.zmm = _mm512_loadu_si512(b); - mask = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask != 0) return sz_false_k; - a += 64, b += 64, length -= 64; - } - - if (length) { - mask = _sz_u64_mask_until(length); - a_vec.zmm = _mm512_maskz_loadu_epi8(mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(mask, b); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpneq_epi8_mask(mask, a_vec.zmm, b_vec.zmm); - return (sz_bool_t)(mask == 0); - } +SZ_PUBLIC sz_ssize_t sz_alignment_score_serial( // + sz_cptr_t longer, sz_size_t longer_length, // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_error_cost_t const *subs, sz_error_cost_t gap, // + sz_memory_allocator_t *alloc) { - return sz_true_k; -} + // If one of the strings is empty - the edit distance is equal to the length of the other one + if (longer_length == 0) return (sz_ssize_t)shorter_length * gap; + if (shorter_length == 0) return (sz_ssize_t)longer_length * gap; -SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - __m512i value_vec = _mm512_set1_epi8(value); - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores". 
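For readers who have not seen the Wagner-Fischer fallback named above, here is a hedged, two-row scalar sketch of that recurrence. It ignores the `bound` early exits and the custom allocator plumbing, and the helper name is illustrative rather than part of the library.

    #include <stddef.h>
    #include <stdlib.h>

    // Classic Levenshtein distance with two rolling rows;
    // O(a_len * b_len) time, O(b_len) space. Allocation checks omitted for brevity.
    static size_t levenshtein_two_rows(char const *a, size_t a_len, char const *b, size_t b_len) {
        size_t *prev = (size_t *)malloc((b_len + 1) * sizeof(size_t));
        size_t *curr = (size_t *)malloc((b_len + 1) * sizeof(size_t));
        for (size_t j = 0; j <= b_len; ++j) prev[j] = j;
        for (size_t i = 0; i < a_len; ++i) {
            curr[0] = i + 1;
            for (size_t j = 0; j < b_len; ++j) {
                size_t substitution = prev[j] + (a[i] != b[j]);
                size_t deletion = prev[j + 1] + 1, insertion = curr[j] + 1;
                size_t best = substitution < deletion ? substitution : deletion;
                curr[j + 1] = best < insertion ? best : insertion;
            }
            size_t *swap = prev;
            prev = curr, curr = swap;
        }
        size_t result = prev[b_len];
        free(prev), free(curr);
        return result;
    }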
- // - // for (; length >= 64; target += 64, length -= 64) _mm512_storeu_si512(target, value_vec); - // _mm512_mask_storeu_epi8(target, _sz_u64_mask_until(length), value_vec); - // - // When the buffer is small, there isn't much to innovate. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, value_vec); - } - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - _mm512_mask_storeu_epi8(target, head_mask, value_vec); - for (target += head_length; body_length >= 64; target += 64, body_length -= 64) - _mm512_store_si512(target, value_vec); - _mm512_mask_storeu_epi8(target, tail_mask, value_vec); + // Let's make sure that we use the amount proportional to the + // number of elements in the shorter string, not the larger. + if (shorter_length > longer_length) { + sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); + sz_pointer_swap((void **)&longer, (void **)&shorter); } -} -SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores" and "loads". - // - // for (; length >= 64; target += 64, source += 64, length -= 64) - // _mm512_storeu_si512(target, _mm512_loadu_si512(source)); - // __mmask64 mask = _sz_u64_mask_until(length); - // _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - // - // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, - // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. - // With two strings, we may consider the overal workload huge, if each exceeds 1 MB in length. - int const is_huge = length >= 1ull * 1024ull * 1024ull; - - // When the buffer is small, there isn't much to innovate. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - } - // When dealing wirh larger arrays, the optimization is not as simple as with the `sz_fill_avx512` function, - // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer, - // we can use aligned loads and stores, and the performance will be great. - else if ((sz_size_t)target % 64 == 0 && (sz_size_t)source % 64 == 0 && !is_huge) { - for (; length >= 64; target += 64, source += 64, length -= 64) - _mm512_store_si512(target, _mm512_load_si512(source)); - // At this point the length is guaranteed to be under 64. - __mmask64 mask = _sz_u64_mask_until(length); - // Aligned load and stores would work too, but it's not defined. - _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - } - // The trickiest case is when both `source` and `target` are not aligned. 
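The head/body/tail decomposition used by the masked `fill`, `copy`, and `move` kernels above follows one small computation that is easy to get wrong, so a hedged scalar sketch of it may help. It assumes the buffer is longer than one cache line, which is exactly the branch the vectorized code reserves it for.

    #include <stddef.h>
    #include <stdint.h>

    typedef struct { size_t head, body, tail; } cache_line_split_t;

    // Split a destination of `length > 64` bytes into an unaligned head,
    // a 64-byte-aligned body (a multiple of 64), and an unaligned tail.
    static cache_line_split_t split_by_cache_lines(void const *target, size_t length) {
        cache_line_split_t split;
        split.head = (64 - ((uintptr_t)target % 64)) % 64; // 0..63 bytes until the next cache line.
        split.tail = ((uintptr_t)target + length) % 64;    // 0..63 bytes past the last full line.
        split.body = length - split.head - split.tail;     // Whole cache lines in-between.
        return split;
    }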
- // In such and simpler cases we can copy enough bytes into `target` to reach its cacheline boundary, - // and then combine unaligned loads with aligned stores. - else if (!is_huge) { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - for (target += head_length, source += head_length; body_length >= 64; - target += 64, source += 64, body_length -= 64) - _mm512_store_si512(target, _mm512_loadu_si512(source)); // Unaligned load, but aligned store! - _mm512_mask_storeu_epi8(target, tail_mask, _mm512_maskz_loadu_epi8(tail_mask, source)); - } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // - // 1. Moving in both directions to maximize the throughput, when fetching from multiple - // memory pages. Also helps with cache set-associativity issues, as we won't always - // be fetching the same entries in the lookup table. - // 2. Using non-temporal stores to avoid polluting the cache. - // 3. Prefetching the next cache line, to avoid stalling the CPU. This generally useless - // for predictable patterns, so disregard this advice. - // - // Bidirectional traversal adds about 10%, accelerating from 11 GB/s to 12 GB/s. - // Using "streaming stores" boosts us from 12 GB/s to 19 GB/s. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; - sz_size_t tail_length = (sz_size_t)(target + length) % 64; - sz_size_t body_length = length - head_length - tail_length; - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - _mm512_mask_storeu_epi8(target + head_length + body_length, tail_mask, - _mm512_maskz_loadu_epi8(tail_mask, source)); - - // Now in the main loop, we can use non-temporal loads and stores, - // performing the operation in both directions. - for (target += head_length, source += head_length; // - body_length >= 128; // - target += 64, source += 64, body_length -= 128) { - _mm512_stream_si512((__m512i *)(target), _mm512_loadu_si512(source)); - _mm512_stream_si512((__m512i *)(target + body_length - 64), _mm512_loadu_si512(source + body_length - 64)); - } - if (body_length >= 64) _mm512_stream_si512((__m512i *)target, _mm512_loadu_si512(source)); + // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. + sz_memory_allocator_t global_alloc; + if (!alloc) { + sz_memory_allocator_init_default(&global_alloc); + alloc = &global_alloc; } -} -SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - if (target == source) return; // Don't be silly, don't move the data if it's already there. + sz_size_t n = shorter_length + 1; + sz_size_t buffer_length = sizeof(sz_ssize_t) * n * 2; + sz_ssize_t *distances = (sz_ssize_t *)alloc->allocate(buffer_length, alloc->handle); + sz_ssize_t *previous_distances = distances; + sz_ssize_t *current_distances = previous_distances + n; - // On very short buffers, that are one cache line in width or less, we don't need any loops. 
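Since the NULL-allocator fallback above exists specifically to simplify calls from higher-level code, a hedged usage sketch may be useful. It assumes `sz_error_cost_t` is a small signed integer and that the substitution matrix is a flat 256 x 256 table indexed by unsigned byte values, which is what the `subs + byte * 256` indexing in this function implies; the wrapper name is illustrative.

    // Score two strings with unit costs: 0 for a match, -1 for a mismatch or a gap.
    static sz_ssize_t score_with_unit_costs(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) {
        static sz_error_cost_t costs[256 * 256];
        for (sz_size_t i = 0; i != 256; ++i)
            for (sz_size_t j = 0; j != 256; ++j) costs[i * 256 + j] = i == j ? 0 : -1;
        // Passing NULL as the allocator triggers the default-allocator fallback above.
        return sz_alignment_score_serial(a, a_length, b, b_length, costs, -1, NULL);
    }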
- // We can also avoid any data-dependencies between iterations, assuming we have 32 registers - // to pre-load the data, before writing it back. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - } - else if (length <= 128) { - sz_size_t last_length = length - 64; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_maskz_loadu_epi8(mask, source + 64); - _mm512_storeu_epi8(target, source0); - _mm512_mask_storeu_epi8(target + 64, mask, source1); - } - else if (length <= 192) { - sz_size_t last_length = length - 128; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_loadu_epi8(source + 64); - __m512i source2 = _mm512_maskz_loadu_epi8(mask, source + 128); - _mm512_storeu_epi8(target, source0); - _mm512_storeu_epi8(target + 64, source1); - _mm512_mask_storeu_epi8(target + 128, mask, source2); - } - else if (length <= 256) { - sz_size_t last_length = length - 192; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_loadu_epi8(source + 64); - __m512i source2 = _mm512_loadu_epi8(source + 128); - __m512i source3 = _mm512_maskz_loadu_epi8(mask, source + 192); - _mm512_storeu_epi8(target, source0); - _mm512_storeu_epi8(target + 64, source1); - _mm512_storeu_epi8(target + 128, source2); - _mm512_mask_storeu_epi8(target + 192, mask, source3); - } + for (sz_size_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) + previous_distances[idx_shorter] = (sz_ssize_t)idx_shorter * gap; - // If the regions don't overlap at all, just use "copy" and save some brain cells thinking about corner cases. - else if (target + length < source || target >= source + length) { sz_copy_avx512(target, source, length); } + sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter; + sz_u8_t const *longer_unsigned = (sz_u8_t const *)longer; + for (sz_size_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { + current_distances[0] = ((sz_ssize_t)idx_longer + 1) * gap; - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - // The absolute most common case of using "moves" is shifting the data within a continuous buffer - // when adding a removing some values in it. In such cases, a typical shift is by 1, 2, 4, 8, 16, - // or 32 bytes, rarely larger. For small shifts, under the size of the ZMM register, we can use shuffles. - // - // Remember: - // - if we are shifting data left, that we are traversing to the right. - // - if we are shifting data right, that we are traversing to the left. - int const left_to_right_traversal = source > target; - - // Now we guarantee, that the relative shift within registers is from 1 to 63 bytes and the output is aligned. 
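The traversal-direction rule spelled out in the comments above is the same one any overlapping move must follow; a hedged scalar `memmove` equivalent makes it explicit.

    #include <stddef.h>

    // Copy possibly-overlapping ranges: go left-to-right when the destination
    // starts before the source, right-to-left otherwise, so no byte is
    // overwritten before it has been read.
    static void move_bytes(unsigned char *target, unsigned char const *source, size_t length) {
        if (target == source) return;
        if (target < source)
            for (size_t i = 0; i != length; ++i) target[i] = source[i];
        else
            for (size_t i = length; i != 0; --i) target[i - 1] = source[i - 1];
    }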
-        // Hopefully, we need to shift more than two ZMM registers, so we could consider `valignr` instruction.
-        // Sadly, using `_mm512_alignr_epi8` doesn't make sense, as it operates at a 128-bit granularity.
-        //
-        //    - `_mm256_alignr_epi8` shifts entire 256-bit register, but we need many of them.
-        //    - `_mm512_alignr_epi32` shifts 512-bit chunks, but only if the `shift` is a multiple of 4 bytes.
-        //    - `_mm512_alignr_epi64` shifts 512-bit chunks by 8 bytes.
-        //
-        // All of those have a latency of 1 cycle, and the shift amount must be an immediate value!
-        // For 1-byte-shift granularity, the `_mm512_permutex2var_epi8` has a latency of 6 and needs VBMI!
-        // The most efficient and broadly compatible alternative could be to use a combination of align and shuffle.
-        // A similar approach was outlined in "Byte-wise alignr in AVX512F" by Wojciech Muła.
-        // http://0x80.pl/notesen/2016-10-16-avx512-byte-alignr.html
-        //
-        // That solution, is extremely mouthful, assuming we need compile time constants for the shift amount.
-        // A cleaner one, with a latency of 3 cycles, is to use `_mm512_permutexvar_epi8` or
-        // `_mm512_mask_permutexvar_epi8`, which can be seen as combination of a cross-register shuffle and blend,
-        // and is available with VBMI. That solution is still noticeably slower than AVX2.
-        //
-        // The GLibC implementation also uses non-temporal stores for larger buffers, we don't.
-        // https://codebrowser.dev/glibc/glibc/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S.html
-        if (left_to_right_traversal) {
-            // Head, body, and tail.
-            _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source));
-            for (target += head_length, source += head_length; body_length >= 64;
-                 target += 64, source += 64, body_length -= 64)
-                _mm512_store_si512(target, _mm512_loadu_si512(source));
-            _mm512_mask_storeu_epi8(target, tail_mask, _mm512_maskz_loadu_epi8(tail_mask, source));
-        }
-        else {
-            // Tail, body, and head.
-            _mm512_mask_storeu_epi8(target + head_length + body_length, tail_mask,
-                                    _mm512_maskz_loadu_epi8(tail_mask, source + head_length + body_length));
-            for (; body_length >= 64; body_length -= 64)
-                _mm512_store_si512(target + head_length + body_length - 64,
-                                   _mm512_loadu_si512(source + head_length + body_length - 64));
-            _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source));
+        // Pre-select the substitution costs row for the current character of the longer string.
+        sz_error_cost_t const *a_subs = subs + longer_unsigned[idx_longer] * 256ul;
+        for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) {
+            sz_ssize_t cost_deletion = previous_distances[idx_shorter + 1] + gap;
+            sz_ssize_t cost_insertion = current_distances[idx_shorter] + gap;
+            sz_ssize_t cost_substitution = previous_distances[idx_shorter] + a_subs[shorter_unsigned[idx_shorter]];
+            current_distances[idx_shorter + 1] = sz_max_of_three(cost_deletion, cost_insertion, cost_substitution);
         }
+
+        // Swap previous_distances and current_distances pointers
+        sz_pointer_swap((void **)&previous_distances, (void **)&current_distances);
     }
+
+    // Cache scalar before `free` call.
+ sz_ssize_t result = previous_distances[shorter_length]; + alloc->free(distances, buffer_length, alloc->handle); + return result; } -SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); +SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound) { - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + sz_u64_ctz(mask); - h += 64, h_length -= 64; - } + sz_size_t const min_length = sz_min_of_two(a_length, b_length); + sz_size_t const max_length = sz_max_of_two(a_length, b_length); + sz_cptr_t const a_end = a + min_length; + bound = bound == 0 ? max_length : bound; - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + sz_u64_ctz(mask); + // Walk through both strings using SWAR and counting the number of differing characters. + sz_size_t distance = max_length - min_length; +#if SZ_USE_MISALIGNED_LOADS && !_SZ_IS_BIG_ENDIAN + if (min_length >= SZ_SWAR_THRESHOLD) { + sz_u64_vec_t a_vec, b_vec, match_vec; + for (; a + 8 <= a_end && distance < bound; a += 8, b += 8) { + a_vec.u64 = sz_u64_load(a).u64; + b_vec.u64 = sz_u64_load(b).u64; + match_vec = _sz_u64_each_byte_equal(a_vec, b_vec); + distance += sz_u64_popcount((~match_vec.u64) & 0x8080808080808080ull); + } } +#endif - return SZ_NULL_CHAR; + for (; a != a_end && distance < bound; ++a, ++b) { distance += (*a != *b); } + return sz_min_of_two(distance, bound); } -SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_avx512(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. - __mmask64 matches; - __mmask64 mask; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. 
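The SWAR loop above hides a neat trick in `_sz_u64_each_byte_equal`, and a hedged scalar re-derivation may help readers who have not met it before: the comparison works on the low seven bits and the top bit of every byte lane separately, so carries never leak between lanes, and one population count then yields the number of equal bytes.

    #include <stdint.h>

    // Count how many of the eight byte lanes of `a` and `b` are equal.
    static unsigned count_matching_bytes(uint64_t a, uint64_t b) {
        uint64_t same_bits = ~(a ^ b); // A lane of all ones means the two bytes are equal.
        // Adding 1 to the low 7 bits overflows into the top bit only when all 7 bits are set.
        uint64_t low7_all_set = ((same_bits & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) & 0x8080808080808080ull;
        uint64_t each_equal = low7_all_set & same_bits & 0x8080808080808080ull;
        return (unsigned)__builtin_popcountll(each_equal); // One high bit per fully matching lane.
    }

The Hamming contribution of such an 8-byte chunk is then simply `8 - count_matching_bytes(a, b)`.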
- // We have several optimized versions of the lagorithm for shorter strings, - // but they all mimic the default case for unbounded length needles - if (n_length >= 64) { - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - if (sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } +SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound) { + + sz_cptr_t const a_end = a + a_length; + sz_cptr_t const b_end = b + b_length; + sz_size_t distance = 0; - // TODO: If the last character contains a bad byte, we can reposition the start of the next iteration. - // This will be very helpful for very long needles. + sz_rune_t a_rune, b_rune; + sz_rune_length_t a_rune_length, b_rune_length; + + if (bound) { + for (; a < a_end && b < b_end && distance < bound; a += a_rune_length, b += b_rune_length) { + _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); + _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); + distance += (a_rune != b_rune); } - } - // If there are only 2 or 3 characters in the needle, we don't even need the nested loop. - else if (n_length <= 3) { - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - if (matches) return h + sz_u64_ctz(matches); + // If one string has more runes, we need to go through the tail. + if (distance < bound) { + for (; a < a_end && distance < bound; a += a_rune_length, ++distance) + _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); + + for (; b < b_end && distance < bound; b += b_rune_length, ++distance) + _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); } } - // If the needle is smaller than the size of the ZMM register, we can use masked comparisons - // to avoid the the inner-most nested loop and compare the entire needle against a haystack - // slice in 3 CPU cycles. 
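The bounded UTF-8 loop above leans on a rune extractor to advance by whole code points. As a hedged reminder of what such a helper must minimally do, the sketch below derives the sequence length from the leading byte; validation of continuation bytes is intentionally omitted and the helper name is illustrative.

    #include <stdint.h>

    // Number of bytes in a UTF-8 sequence, judging only by its leading byte.
    static unsigned utf8_rune_length(uint8_t lead) {
        if (lead < 0x80) return 1;           // 0xxxxxxx - ASCII.
        if ((lead & 0xE0) == 0xC0) return 2; // 110xxxxx.
        if ((lead & 0xF0) == 0xE0) return 3; // 1110xxxx.
        if ((lead & 0xF8) == 0xF0) return 4; // 11110xxx.
        return 1;                            // Stray continuation byte - treat as a single byte.
    }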
else { - __mmask64 n_mask = _sz_u64_mask_until(n_length); - sz_u512_vec_t n_full_vec, h_full_vec; - n_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, n); - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - h_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, h + potential_offset); - if (_mm512_mask_cmpneq_epi8_mask(n_mask, h_full_vec.zmm, n_full_vec.zmm) == 0) - return h + potential_offset; - matches &= matches - 1; - } - } - } - - // The "tail" of the function uses masked loads to process the remaining bytes. - { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - if (n_length <= 3 || sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; + for (; a < a_end && b < b_end; a += a_rune_length, b += b_rune_length) { + _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); + _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); + distance += (a_rune != b_rune); } + // If one string has more runes, we need to go through the tail. + for (; a < a_end; a += a_rune_length, ++distance) _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); + for (; b < b_end; b += b_rune_length, ++distance) _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); } - return SZ_NULL_CHAR; + return distance; } -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); - - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h + h_length - 64); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + h_length - 1 - sz_u64_clz(mask); - h_length -= 64; - } - - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + 64 - sz_u64_clz(mask) - 1; - } - - return SZ_NULL_CHAR; -} +#pragma endregion // Serial Implementation -SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx512(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. 
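The removed forward and backward finders share one idea worth restating: compare a few informative needle positions first and verify the full needle only on the rare candidates that pass. A hedged scalar rendition of that filter, with a simplistic stand-in for the anomaly picker, looks like this.

    #include <stddef.h>
    #include <string.h>

    // Find `needle` in `haystack`, rejecting most positions on three characters
    // before paying for a full comparison.
    static char const *find_with_filter(char const *haystack, size_t h_length, //
                                        char const *needle, size_t n_length) {
        if (!n_length || h_length < n_length) return NULL;
        size_t first = 0, mid = n_length / 2, last = n_length - 1; // Stand-in for the anomaly heuristic.
        for (size_t offset = 0; offset + n_length <= h_length; ++offset) {
            char const *candidate = haystack + offset;
            if (candidate[first] != needle[first] || candidate[mid] != needle[mid] || candidate[last] != needle[last])
                continue;                                                     // Cheap three-character rejection.
            if (memcmp(candidate, needle, n_length) == 0) return candidate;   // Full verification.
        }
        return NULL;
    }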
- __mmask64 mask; - __mmask64 matches; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. - sz_cptr_t h_reversed; - for (; h_length >= n_length + 64; h_length -= 64) { - h_reversed = h + h_length - n_length - 64 + 1; - h_first_vec.zmm = _mm512_loadu_si512(h_reversed + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h_reversed + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h_reversed + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_avx512(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } +/* AVX2 implementation of the string similarity algorithms for Haswell processors and newer. + * Very minimalistic (compared to AVX-512), but still faster than the serial implementation. + */ +#pragma region Haswell Implementation +#if SZ_USE_HASWELL +#pragma GCC push_options +#pragma GCC target("haswell") +#pragma clang attribute push(__attribute__((target("haswell"))), apply_to = function) - // The "tail" of the function uses masked loads to process the remaining bytes. - { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_avx512(h + 64 - potential_offset - 1, n, n_length)) - return h + 64 - potential_offset - 1; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } +#pragma clang attribute pop +#pragma GCC pop_options +#endif // SZ_USE_HASWELL +#pragma endregion // Haswell Implementation - return SZ_NULL_CHAR; -} +/* AVX512 implementation of the string similarity algorithms for Skylake and newer CPUs. + * Includes extensions: F, CD, ER, PF, VL, DQ, BW. + * + * This is the "starting level" for the advanced algorithms using K-mask registers on x86. + */ +#pragma region Skylake Implementation +#if SZ_USE_SKYLAKE +#pragma GCC push_options +#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") +#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) #pragma clang attribute pop #pragma GCC pop_options +#endif // SZ_USE_SKYLAKE +#pragma endregion // Skylake Implementation +/* AVX512 implementation of the string similarity algorithms for Ice Lake and newer CPUs. 
+ * Includes extensions: + * - 2017 Skylake: F, CD, ER, PF, VL, DQ, BW, + * - 2018 CannonLake: IFMA, VBMI, + * - 2019 Ice Lake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES. + */ +#pragma region Ice Lake Implementation +#if SZ_USE_ICE #pragma GCC push_options #pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2") #pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ @@ -5317,7 +784,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_avx512( // * Uses a lot more CPU registers space, than the `upto63` variant. * Benefits from the @b `vpermi2b` instructions, that can rotate the bytes in 2 registers at once. * - * This may be one of the most freuqently called kernels for: + * This may be one of the most frequently called kernels for: * - source code analysis, assuming most lines are either under 80 or under 120 characters long. * - DNA sequence alignment, as most short reads are 50-300 characters long. */ @@ -5378,7 +845,6 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto255bound_avx512( // * Benefits from the @b `valignd` instructions used to rotate UTF-32 unpacked unicode codepoints. * * Each string is unpacked into 128 characters * 4 bytes per character / 64 bytes per register = 8 registers. - * */ SZ_INTERNAL sz_size_t _sz_edit_distance_utf8_skewed_diagonals_upto127_avx512( // sz_cptr_t shorter, sz_size_t shorter_length, // @@ -5439,7 +905,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // ones_u16_vec.zmm = _mm512_set1_epi16(1); // This is a mixed-precision implementation, using 8-bit representations for part of the operations. - // Even there, in case `SZ_USE_X86_AVX2=0`, let's use the `sz_u512_vec_t` type, addressing the first YMM halfs. + // Even there, in case `SZ_USE_HASWELL=0`, let's use the `sz_u512_vec_t` type, addressing the first YMM halfs. sz_u512_vec_t shorter_vec, longer_vec; sz_u512_vec_t ones_u8_vec; ones_u8_vec.ymms[0] = _mm256_set1_epi8(1); @@ -5527,539 +993,60 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // // First get the minimum of insertions and deletions. next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm); next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm); - _mm512_mask_storeu_epi16(next_distances + i, remaining_length_mask, next_vec.zmm); - i += register_length; - } - - // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory, this time, with a shift, - // dropping the first element in the current array. - sz_u16_t *temporary = previous_distances; - previous_distances = current_distances + 1; - current_distances = next_distances; - next_distances = temporary; - } - - // Cache scalar before `free` call. - sz_size_t result = current_distances[0]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -#endif - return 0; -} - -SZ_INTERNAL sz_size_t sz_edit_distance_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Bounded computations may exit early. - int const is_bounded = bound < longer_length; - if (is_bounded) { - // If one of the strings is empty - the edit distance is equal to the length of the other one. 
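The "skewed diagonals" kernels referenced in this region rely on one observation: all cells of the Levenshtein matrix with the same `i + j` depend only on the two previous anti-diagonals, so they can be filled independently and therefore vectorized. A hedged, deliberately scalar sketch of that traversal order, without any bounds or SIMD, is shown below; error handling is omitted.

    #include <stddef.h>
    #include <stdlib.h>

    // Fill the Levenshtein matrix one anti-diagonal (i + j == d) at a time,
    // keeping only the last three diagonals in memory.
    static size_t levenshtein_antidiagonal(char const *a, size_t n, char const *b, size_t m) {
        size_t *prev2 = (size_t *)calloc(n + 1, sizeof(size_t)); // Diagonal d - 2.
        size_t *prev1 = (size_t *)calloc(n + 1, sizeof(size_t)); // Diagonal d - 1.
        size_t *curr = (size_t *)calloc(n + 1, sizeof(size_t));  // Diagonal d.
        for (size_t d = 0; d <= n + m; ++d) {
            size_t i_min = d > m ? d - m : 0, i_max = d < n ? d : n;
            for (size_t i = i_min; i <= i_max; ++i) {
                size_t j = d - i;
                if (i == 0) { curr[i] = j; continue; } // First row: j insertions.
                if (j == 0) { curr[i] = i; continue; } // First column: i deletions.
                size_t deletion = prev1[i - 1] + 1;                          // From cell (i-1, j).
                size_t insertion = prev1[i] + 1;                             // From cell (i, j-1).
                size_t substitution = prev2[i - 1] + (a[i - 1] != b[j - 1]); // From cell (i-1, j-1).
                size_t best = deletion < insertion ? deletion : insertion;
                curr[i] = best < substitution ? best : substitution;
            }
            size_t *recycled = prev2; // Rotate the three diagonal buffers.
            prev2 = prev1, prev1 = curr, curr = recycled;
        }
        size_t result = prev1[n]; // After the last rotation `prev1` holds diagonal n + m.
        free(prev2), free(prev1), free(curr);
        return result;
    }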
- if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); - // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; - } - - // Make sure the shorter string is actually shorter. - if (shorter_length > longer_length) { - sz_cptr_t temporary = shorter; - shorter = longer; - longer = temporary; - sz_size_t temporary_length = shorter_length; - shorter_length = longer_length; - longer_length = temporary_length; - } - - // Dispatch the right implementation based on the length of the strings. - if (longer_length < 64u) - return _sz_edit_distance_skewed_diagonals_upto63_avx512( // - shorter, shorter_length, longer, longer_length, bound); - // else if (longer_length < 256u * 256u) - // return _sz_edit_distance_skewed_diagonals_upto65k_avx512( // - // shorter, shorter_length, longer, longer_length, bound, alloc); - else - return sz_edit_distance_serial(shorter, shorter_length, longer, longer_length, bound, alloc); -} - -SZ_PUBLIC sz_u64_t sz_checksum_avx512(sz_cptr_t text, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "loads". - // - // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, - // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. - // With two strings, we may consider the overal workload huge, if each exceeds 1 MB in length. - int const is_huge = length >= 1ull * 1024ull * 1024ull; - sz_u512_vec_t text_vec, sums_vec; - - // When the buffer is small, there isn't much to innovate. - if (length <= 16) { - __mmask16 mask = _sz_u16_mask_until(length); - text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text); - sums_vec.xmms[0] = _mm_sad_epu8(text_vec.xmms[0], _mm_setzero_si128()); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_vec.xmms[0]); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_vec.xmms[0], 1); - return low + high; - } - else if (length <= 32) { - __mmask32 mask = _sz_u32_mask_until(length); - text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text); - sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256()); - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - return low + high; - } - else if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - text_vec.zmm = _mm512_maskz_loadu_epi8(mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - return _mm512_reduce_add_epi64(sums_vec.zmm); - } - else if (!is_huge) { - sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(text + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. 
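The checksum kernel above reduces 64 bytes at a time with `vpsadbw` against a zero vector. The same idiom is available far below AVX-512; a hedged SSE2-only sketch for a single 16-byte chunk shows the shape of the reduction.

    #include <emmintrin.h>
    #include <stdint.h>

    // Sum 16 unsigned bytes: `psadbw` against zero yields two 64-bit partial sums.
    static uint64_t sum_16_bytes(unsigned char const *data) {
        __m128i chunk = _mm_loadu_si128((__m128i const *)data);
        __m128i partial_sums = _mm_sad_epu8(chunk, _mm_setzero_si128());
        __m128i upper_sum = _mm_unpackhi_epi64(partial_sums, partial_sums); // Move the high half down.
        return (uint64_t)(_mm_cvtsi128_si64(partial_sums) + _mm_cvtsi128_si64(upper_sum));
    }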
- __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - for (text += head_length; body_length >= 64; text += 64, body_length -= 64) { - text_vec.zmm = _mm512_load_si512((__m512i const *)text); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - } - text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - return _mm512_reduce_add_epi64(sums_vec.zmm); - } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // - // 1. Moving in both directions to maximize the throughput, when fetching from multiple - // memory pages. Also helps with cache set-associativity issues, as we won't always - // be fetching the same entries in the lookup table. - // 2. Using non-temporal stores to avoid polluting the cache. - // 3. Prefetching the next cache line, to avoid stalling the CPU. This generally useless - // for predictable patterns, so disregard this advice. - // - // Bidirectional traversal generally adds about 10% to such algorithms. - else { - sz_u512_vec_t text_reversed_vec, sums_reversed_vec; - sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; - sz_size_t tail_length = (sz_size_t)(text + length) % 64; - sz_size_t body_length = length - head_length - tail_length; - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length); - sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()); - - // Now in the main loop, we can use non-temporal loads and stores, - // performing the operation in both directions. - for (text += head_length; body_length >= 128; text += 64, text += 64, body_length -= 128) { - text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text)); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - text_reversed_vec.zmm = _mm512_stream_load_si512((__m512i *)(text + body_length - 64)); - sums_reversed_vec.zmm = - _mm512_add_epi64(sums_reversed_vec.zmm, _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512())); - } - if (body_length >= 64) { - text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text)); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - } - - return _mm512_reduce_add_epi64(_mm512_add_epi64(sums_vec.zmm, sums_reversed_vec.zmm)); - } -} - -SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - if (length < 4 * window_length) { - sz_hashes_serial(start, length, window_length, step, callback, callback_handle); - return; - } - - // Using AVX2, we can perform 4 long integer multiplications and additions within one register. - // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel. 
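Slicing the text into four overlapping lanes, as the comment above describes, only takes a handful of pointer adjustments; a hedged sketch with illustrative names makes the overlap explicit. Each lane starts `hashes / 4` window positions after the previous one, so together the four walks cover every window position, with small overlaps when the count is not a multiple of four.

    #include <stddef.h>

    typedef struct { char const *first, *second, *third, *fourth, *end; } four_lanes_t;

    // Pick four starting points so that four parallel sliding windows
    // jointly cover all `length - window_length + 1` positions.
    static four_lanes_t slice_into_four_lanes(char const *text, size_t length, size_t window_length) {
        size_t const hashes = length - window_length + 1;
        size_t const per_lane = hashes / 4;
        four_lanes_t lanes;
        lanes.first = text;
        lanes.second = text + per_lane;
        lanes.third = text + per_lane * 2;
        lanes.fourth = text + per_lane * 3;
        lanes.end = text + length;
        return lanes;
    }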
- sz_size_t const max_hashes = length - window_length + 1; - sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads. - sz_u8_t const *text_first = (sz_u8_t const *)start; - sz_u8_t const *text_second = text_first + min_hashes_per_thread; - sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2; - sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3; - sz_u8_t const *text_end = text_first + length; - - // Broadcast the global constants into the registers. - // Both high and low hashes will work with the same prime and golden ratio. - sz_u512_vec_t prime_vec, golden_ratio_vec; - prime_vec.zmm = _mm512_set1_epi64(SZ_U64_MAX_PRIME); - golden_ratio_vec.zmm = _mm512_set1_epi64(11400714819323198485ull); - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // We will be evaluating 4 offsets at a time with 2 different hash functions. - // We can fit all those 8 state variables in each of the following ZMM registers. - sz_u512_vec_t base_vec, prime_power_vec, shift_vec; - base_vec.zmm = _mm512_set_epi64(31ull, 31ull, 31ull, 31ull, 257ull, 257ull, 257ull, 257ull); - shift_vec.zmm = _mm512_set_epi64(0ull, 0ull, 0ull, 0ull, 77ull, 77ull, 77ull, 77ull); - prime_power_vec.zmm = _mm512_set_epi64(prime_power_low, prime_power_low, prime_power_low, prime_power_low, - prime_power_high, prime_power_high, prime_power_high, prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u512_vec_t hash_vec, chars_vec; - hash_vec.zmm = _mm512_setzero_si512(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`... - chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - - // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); - } - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. 
- sz_u512_vec_t hash_mix_vec; - hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_extracti64x4_epi64(hash_mix_vec.zmm, 0)); - - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. - chars_vec.zmm = _mm512_set_epi64(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length], // - text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - hash_vec.zmm = _mm512_sub_epi64(hash_vec.zmm, _mm512_mullo_epi64(chars_vec.zmm, prime_power_vec.zmm)); - - // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - - // ... and prefetch the next four characters into Level 2 or higher. - _mm_prefetch((sz_cptr_t)text_fourth + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_third + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_second + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_first + 1, _MM_HINT_T1); - - // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. 
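Stripped of the SIMD plumbing, steps 0 through 4 above form the textbook polynomial rolling-hash update. The hedged scalar model below uses a deliberately small prime and base so that plain 64-bit arithmetic cannot overflow, and an ordinary `%` stands in for the conditional subtraction; the vectorized code works with much larger constants and wrap-around multiplication instead.

    #include <stdint.h>

    // One sliding-window step: retire the oldest byte, admit the newest one.
    // `prime_power` is base^(window_length - 1) % prime, precomputed once.
    // Assumes `prime` fits in 32 bits and `base` is small (e.g. 31 or 257).
    static uint64_t rolling_hash_update(uint64_t hash, uint8_t outgoing, uint8_t incoming, //
                                        uint64_t base, uint64_t prime, uint64_t prime_power) {
        hash = (hash + prime - (outgoing * prime_power) % prime) % prime; // Drop the outgoing byte.
        hash = (hash * base + incoming) % prime;                          // Shift and add the incoming byte.
        return hash;
    }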
- hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_castsi512_si256(hash_mix_vec.zmm)); - - if ((cycle & step_mask) == 0) { - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - } - } -} - -#pragma clang attribute pop -#pragma GCC pop_options - -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512vbmi", "avx512vbmi2", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512vbmi,avx512vbmi2,bmi,bmi2"))), \ - apply_to = function) - -SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - // But if at least 3 cache lines are touched, the AVX-512 implementation should be faster. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } - - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - // We need to pull the lookup table into 4x ZMM registers. - // We can use `vpermi2b` instruction to perform the look in two ZMM registers with `_mm512_permutex2var_epi8` - // intrinsics, but it has a 6-cycle latency on Sapphire Rapids and requires AVX512-VBMI. Assuming we need to - // operate on 4 registers, it might be cleaner to use 2x separate `_mm512_permutexvar_epi8` calls. - // Combining the results with 2x `_mm512_test_epi8_mask` and 3x blends afterwards. 
- // - // - 4x `_mm512_permutexvar_epi8` maps to "VPERMB (ZMM, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p5 - // - On Genoa: 6 cycles latency, ports: 1*FP12 - // - 3x `_mm512_mask_blend_epi8` maps to "VPBLENDMB_Z (ZMM, K, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p05 - // - On Genoa: 1 cycle latency, ports: 1*FP0123 - // - 2x `_mm512_test_epi8_mask` maps to "VPTESTMB (K, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p5 - // - On Genoa: 4 cycles latency, ports: 1*FP01 - // - sz_u512_vec_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec; - lut_0_to_63_vec.zmm = _mm512_loadu_si512((lut)); - lut_64_to_127_vec.zmm = _mm512_loadu_si512((lut + 64)); - lut_128_to_191_vec.zmm = _mm512_loadu_si512((lut + 128)); - lut_192_to_255_vec.zmm = _mm512_loadu_si512((lut + 192)); - - sz_u512_vec_t first_bit_vec, second_bit_vec; - first_bit_vec.zmm = _mm512_set1_epi8((char)0x80); - second_bit_vec.zmm = _mm512_set1_epi8((char)0x40); - - __mmask64 first_bit_mask, second_bit_mask; - sz_u512_vec_t source_vec; - // If the top bit is set in each word of `source_vec`, than we use `lookup_128_to_191_vec` or - // `lookup_192_to_255_vec`. If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`. - sz_u512_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec; - sz_u512_vec_t blended_0_to_127_vec, blended_128_to_255_vec, blended_0_to_255_vec; - - // Handling the head. - if (head_length) { - source_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_mask_storeu_epi8(target, head_mask, blended_0_to_255_vec.zmm); - source += head_length, target += head_length, length -= head_length; - } + _mm512_mask_storeu_epi16(next_distances + i, remaining_length_mask, next_vec.zmm); + i += register_length; + } - // Handling the body in 64-byte chunks aligned to cache-line boundaries with respect to `target`. 
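The blend logic described above is easier to digest in scalar form. The hedged model below treats the 256-byte table as four 64-byte quarters: the low six bits of the input index within a quarter (which is all `vpermb` looks at), bit 6 chooses between adjacent quarters, and bit 7 chooses between the lower and upper half, mirroring the two test masks and three blends.

    #include <stdint.h>

    // Scalar equivalent of the four-permute, three-blend lookup: returns table[input].
    static uint8_t lut_select(uint8_t const table[256], uint8_t input) {
        uint8_t within_quarter = input & 0x3F; // What each `vpermb` effectively uses as an index.
        uint8_t from_low_half = (input & 0x40) ? table[64 + within_quarter] : table[within_quarter];
        uint8_t from_high_half = (input & 0x40) ? table[192 + within_quarter] : table[128 + within_quarter];
        return (input & 0x80) ? from_high_half : from_low_half;
    }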
- while (length >= 64) { - source_vec.zmm = _mm512_loadu_si512(source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_store_si512(target, blended_0_to_255_vec.zmm); //! Aligned store, our main weapon! - source += 64, target += 64, length -= 64; + // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory, this time, with a shift, + // dropping the first element in the current array. + sz_u16_t *temporary = previous_distances; + previous_distances = current_distances + 1; + current_distances = next_distances; + next_distances = temporary; } - // Handling the tail. - if (tail_length) { - source_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_mask_storeu_epi8(target, tail_mask, blended_0_to_255_vec.zmm); - source += tail_length, target += tail_length, length -= tail_length; - } + // Cache scalar before `free` call. + sz_size_t result = current_distances[0]; + alloc->free(distances, buffer_length, alloc->handle); + return result; +#endif + return 0; } -SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { +SZ_INTERNAL sz_size_t sz_edit_distance_avx512( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // + sz_size_t bound, sz_memory_allocator_t *alloc) { - // Before initializing the AVX-512 vectors, we may want to run the sequential code for the first few bytes. - // In practice, that only hurts, even when we have matches every 5-ish bytes. 
- // - // if (length < SZ_SWAR_THRESHOLD) return sz_find_charset_serial(text, length, filter); - // sz_cptr_t early_result = sz_find_charset_serial(text, SZ_SWAR_THRESHOLD, filter); - // if (early_result) return early_result; - // text += SZ_SWAR_THRESHOLD; - // length -= SZ_SWAR_THRESHOLD; - // - // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. - // That way when we invoke `_mm512_shuffle_epi8` we can use the same mask for both lanes. - sz_u512_vec_t filter_even_vec, filter_odd_vec; - __m256i filter_ymm = _mm256_lddqu_si256((__m256i const *)filter); - // There are a few way to initialize filters without having native strided loads. - // In the cronological order of experiments: - // - serial code initializing 128 bytes of odd and even mask - // - using several shuffles - // - using `_mm512_permutexvar_epi8` - // - using `_mm512_broadcast_i32x4(_mm256_castsi256_si128(_mm256_maskz_compress_epi8(0x55555555, filter_ymm)))` - // and `_mm512_broadcast_i32x4(_mm256_castsi256_si128(_mm256_maskz_compress_epi8(0xaaaaaaaa, filter_ymm)))` - filter_even_vec.zmm = _mm512_broadcast_i32x4(_mm256_castsi256_si128( // broadcast __m128i to __m512i - _mm256_maskz_compress_epi8(0x55555555, filter_ymm))); - filter_odd_vec.zmm = _mm512_broadcast_i32x4(_mm256_castsi256_si128( // broadcast __m128i to __m512i - _mm256_maskz_compress_epi8(0xaaaaaaaa, filter_ymm))); - // After the unzipping operation, we can validate the contents of the vectors like this: - // - // for (sz_size_t i = 0; i != 16; ++i) { - // sz_assert(filter_even_vec.u8s[i] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 16] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 16] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 32] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 32] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 48] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 48] == filter->_u8s[i * 2 + 1]); - // } - // - sz_u512_vec_t text_vec; - sz_u512_vec_t lower_nibbles_vec, higher_nibbles_vec; - sz_u512_vec_t bitset_even_vec, bitset_odd_vec; - sz_u512_vec_t bitmask_vec, bitmask_lookup_vec; - bitmask_lookup_vec.zmm = _mm512_set_epi8( // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); - - while (length) { - // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set" - // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so - // StrinZilla uses a somewhat different approach. - // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new - // - // sz_u8_t input = *(sz_u8_t const *)text; - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = filter_even_vec.u8s[hi_nibble]; - // sz_u8_t bitset_odd = filter_odd_vec.u8s[hi_nibble]; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_u8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd; - // if ((bitset & bitmask) != 0) return text; - // else { length--, text++; } - // - // The nice part about this, loading the strided data is vey easy with Arm NEON, - // while with x86 CPUs after AVX, shuffles within 256 bits shouldn't be an issue either. 
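    // For reference, the even/odd split is just a nibble-wise re-indexing of the 32-byte bitset:
    // for any byte `c`, the serial test `filter->_u8s[c >> 3] & (1 << (c & 7))` can be rewritten as
    //
    //      sz_u8_t hi_nibble = c >> 4, lo_nibble = c & 0x0F;
    //      sz_u8_t bitset = lo_nibble < 8 ? filter->_u8s[hi_nibble * 2] : filter->_u8s[hi_nibble * 2 + 1];
    //      sz_u8_t is_match = bitset & (1 << (lo_nibble & 0x7));
    //
    // because `c >> 3 == hi_nibble * 2 + (lo_nibble >= 8)`. For `c == 'a'` (0x61) both forms test
    // bit 1 of byte 12 of the set.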
- sz_size_t load_length = sz_min_of_two(length, 64); - __mmask64 load_mask = _sz_u64_mask_until(load_length); - text_vec.zmm = _mm512_maskz_loadu_epi8(load_mask, text); - lower_nibbles_vec.zmm = _mm512_and_si512(text_vec.zmm, _mm512_set1_epi8(0x0f)); - bitmask_vec.zmm = _mm512_shuffle_epi8(bitmask_lookup_vec.zmm, lower_nibbles_vec.zmm); - // - // At this point we can validate the `bitmask_vec` contents like this: - // - // for (sz_size_t i = 0; i != load_length; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); - // } - // - // Shift right every byte by 4 bits. - // There is no `_mm512_srli_epi8` intrinsic, so we have to use `_mm512_srli_epi16` - // and combine it with a mask to clear the higher bits. - higher_nibbles_vec.zmm = _mm512_and_si512(_mm512_srli_epi16(text_vec.zmm, 4), _mm512_set1_epi8(0x0f)); - bitset_even_vec.zmm = _mm512_shuffle_epi8(filter_even_vec.zmm, higher_nibbles_vec.zmm); - bitset_odd_vec.zmm = _mm512_shuffle_epi8(filter_odd_vec.zmm, higher_nibbles_vec.zmm); - // - // At this point we can validate the `bitset_even_vec` and `bitset_odd_vec` contents like this: - // - // for (sz_size_t i = 0; i != load_length; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t const *bitset_ptr = &filter->_u8s[0]; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; - // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); - // } - // - // TODO: Is this a good place for ternary logic? - __mmask64 take_first = _mm512_cmplt_epi8_mask(lower_nibbles_vec.zmm, _mm512_set1_epi8(8)); - bitset_even_vec.zmm = _mm512_mask_blend_epi8(take_first, bitset_odd_vec.zmm, bitset_even_vec.zmm); - __mmask64 matches_mask = _mm512_mask_test_epi8_mask(load_mask, bitset_even_vec.zmm, bitmask_vec.zmm); - if (matches_mask) { - int offset = sz_u64_ctz(matches_mask); - return text + offset; - } - else { text += load_length, length -= load_length; } + // Bounded computations may exit early. + int const is_bounded = bound < longer_length; + if (is_bounded) { + // If one of the strings is empty - the edit distance is equal to the length of the other one. + if (longer_length == 0) return sz_min_of_two(shorter_length, bound); + if (shorter_length == 0) return sz_min_of_two(longer_length, bound); + // If the difference in length is beyond the `bound`, there is no need to check at all. + if (longer_length - shorter_length > bound) return bound; } - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); -} - -SZ_PUBLIC sz_cptr_t sz_find_many_avx512( // - sz_cptr_t haystack, sz_size_t haystack_length, // - sz_cptr_t const *needles, sz_size_t const *needles_lengths, // - sz_size_t *needle_offset) { + // Make sure the shorter string is actually shorter. + if (shorter_length > longer_length) { + sz_cptr_t temporary = shorter; + shorter = longer; + longer = temporary; + sz_size_t temporary_length = shorter_length; + shorter_length = longer_length; + longer_length = temporary_length; + } - // When dealing with huge needles vocabularies, like in tokenization workloads, we need to construct an automaton. 
- // But in many cases, the vocabulary is small enough to use a simpler DFA-less approach, combining the ideas from - // the `sz_find_avx512` and `sz_find_charset_avx512` functions. - // - // Pick the offsets within needles where there is the least variance in the characters. - // Like for "the", "then", "there", "these", "those", "their", "they", "them", "that", "this", "thus", "than": - // - // 0: 't' - // 1: 'h' - // 2: 'e', 'a', 'i', 'o', 'u' - // 3: 'n', 'r', 's', 'i', 'y', 'm', 't' - // - // So depending on our "register budget", we can use a different number of pivot points: offset 0, 1, 2 make - // the most sense if we can only use 3 ZMM registers. - sz_unused(haystack && haystack_length && needles && needles_lengths && needle_offset); - return 0; + // Dispatch the right implementation based on the length of the strings. + if (longer_length < 64u) + return _sz_edit_distance_skewed_diagonals_upto63_avx512( // + shorter, shorter_length, longer, longer_length, bound); + // else if (longer_length < 256u * 256u) + // return _sz_edit_distance_skewed_diagonals_upto65k_avx512( // + // shorter, shorter_length, longer, longer_length, bound, alloc); + else + return sz_edit_distance_serial(shorter, shorter_length, longer, longer_length, bound, alloc); } /** @@ -6075,9 +1062,9 @@ SZ_PUBLIC sz_cptr_t sz_find_many_avx512( // * a slice, which is much easier to optimize. In that case we are sampling costs not from arbitrary parts of * a 256 x 256 matrix, but from a single row! */ -SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // +SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { // If one of the strings is empty - the edit distance is equal to the length of the other one @@ -6284,779 +1271,57 @@ SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_avx512( // return result; } -SZ_INTERNAL sz_ssize_t sz_alignment_score_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // +SZ_INTERNAL sz_ssize_t sz_alignment_score_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { if (sz_max_of_two(shorter_length, longer_length) < (256ull * 256ull * 256ull)) - return _sz_alignment_score_wagner_fisher_upto17m_avx512(shorter, shorter_length, longer, longer_length, subs, - gap, alloc); + return _sz_alignment_score_wagner_fisher_upto17m_ice(shorter, shorter_length, longer, longer_length, subs, gap, + alloc); else return sz_alignment_score_serial(shorter, shorter_length, longer, longer_length, subs, gap, alloc); } -enum sz_encoding_t { - sz_encoding_unknown_k = 0, - sz_encoding_ascii_k = 1, - sz_encoding_utf8_k = 2, - sz_encoding_utf16_k = 3, - sz_encoding_utf32_k = 4, - sz_jwt_k, - sz_base64_k, - // Low priority encodings: - sz_encoding_utf8bom_k = 5, - sz_encoding_utf16le_k = 6, - sz_encoding_utf16be_k = 7, - sz_encoding_utf32le_k = 8, - sz_encoding_utf32be_k = 9, -}; - -// Character Set Detection is one of the most commonly performed operations in data processing with -// [Chardet](https://github.com/chardet/chardet), [Charset Normalizer](https://github.com/jawah/charset_normalizer), -// 
[cChardet](https://github.com/PyYoshi/cChardet) being the most commonly used options in the Python ecosystem. -// All of them are notoriously slow. -// -// Moreover, as of October 2024, UTF-8 is the dominant character encoding on the web, used by 98.4% of websites. -// Other have minimal usage, according to [W3Techs](https://w3techs.com/technologies/overview/character_encoding): -// - ISO-8859-1: 1.2% -// - Windows-1252: 0.3% -// - Windows-1251: 0.2% -// - EUC-JP: 0.1% -// - Shift JIS: 0.1% -// - EUC-KR: 0.1% -// - GB2312: 0.1% -// - Windows-1250: 0.1% -// Within programming language implementations and database management systems, 16-bit and 32-bit fixed-width encodings -// are also very popular and we need a way to efficienly differentiate between the most common UTF flavors, ASCII, and -// the rest. -// -// One good solution is the [simdutf](https://github.com/simdutf/simdutf) library, but it depends on the C++ runtime -// and focuses more on incremental validation & transcoding, rather than detection. -// -// So we need a very fast and efficient way of determining -SZ_PUBLIC sz_bool_t sz_detect_encoding(sz_cptr_t text, sz_size_t length) { - // https://github.com/simdutf/simdutf/blob/master/src/icelake/icelake_utf8_validation.inl.cpp - // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_from_utf8.inl.cpp#L81 - // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L661 - // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L788 - - // We can implement this operation simpler & differently, assuming most of the time continuous chunks of memory - // have identical encoding. With Russian and many European languages, we generally deal with 2-byte codepoints - // with occasional 1-byte punctuation marks. In the case of Chinese, Japanese, and Korean, we deal with 3-byte - // codepoints. In the case of emojis, we deal with 4-byte codepoints. - // We can also use the idea, that misaligned reads are quite cheap on modern CPUs. - int can_be_ascii = 1, can_be_utf8 = 1, can_be_utf16 = 1, can_be_utf32 = 1; - sz_unused(can_be_ascii + can_be_utf8 + can_be_utf16 + can_be_utf32); - sz_unused(text && length); - return sz_false_k; -} - #pragma clang attribute pop #pragma GCC pop_options -#endif +#endif // SZ_USE_ICE +#pragma endregion // Ice Lake Implementation -#pragma endregion - -/* @brief Implementation of the string search algorithms using the Arm NEON instruction set, available on 64-bit - * Arm processors. Implements: {substring search, character search, character set search} x {forward, reverse}. +/* Implementation of the similarity algorithms using the Arm NEON instruction set, available on 64-bit + * Arm processors. Covers billions of mobile CPUs worldwide, including Apple's A-series, and Qualcomm's Snapdragon. */ -#pragma region ARM NEON - -#if SZ_USE_ARM_NEON +#pragma region NEON Implementation +#if SZ_USE_NEON #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+simd") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) -/** - * @brief Helper structure to simplify work with 64-bit words. 
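 *
 *        For example, it lets the NEON kernels below load a whole 128-bit register and still read
 *        individual lanes as scalars, something like:
 *
 *            sz_u128_vec_t h_vec;
 *            h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h);
 *            sz_u8_t first_byte = h_vec.u8s[0];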
- */ -typedef union sz_u128_vec_t { - uint8x16_t u8x16; - uint16x8_t u16x8; - uint32x4_t u32x4; - uint64x2_t u64x2; - sz_u64_t u64s[2]; - sz_u32_t u32s[4]; - sz_u16_t u16s[8]; - sz_u8_t u8s[16]; -} sz_u128_vec_t; - -SZ_INTERNAL sz_u64_t _sz_vreinterpretq_u8_u4(uint8x16_t vec) { - // Use `vshrn` to produce a bitmask, similar to `movemask` in SSE. - // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon - return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(vec), 4)), 0) & 0x8888888888888888ull; -} - -SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: - //! https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations - return sz_order_serial(a, a_length, b, b_length); -} - -SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_u128_vec_t a_vec, b_vec; - for (; length >= 16; a += 16, b += 16, length -= 16) { - a_vec.u8x16 = vld1q_u8((sz_u8_t const *)a); - b_vec.u8x16 = vld1q_u8((sz_u8_t const *)b); - uint8x16_t cmp = vceqq_u8(a_vec.u8x16, b_vec.u8x16); - if (vminvq_u8(cmp) != 255) { return sz_false_k; } // Check if all bytes match - } - - // Handle remaining bytes - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; -} - -SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) { - uint64x2_t sum_vec = vdupq_n_u64(0); - - // Process 16 bytes (128 bits) at a time - for (; length >= 16; text += 16, length -= 16) { - uint8x16_t vec = vld1q_u8((sz_u8_t const *)text); // Load 16 bytes - uint16x8_t pairwise_sum1 = vpaddlq_u8(vec); // Pairwise add lower and upper 8 bits - uint32x4_t pairwise_sum2 = vpaddlq_u16(pairwise_sum1); // Pairwise add 16-bit results - uint64x2_t pairwise_sum3 = vpaddlq_u32(pairwise_sum2); // Pairwise add 32-bit results - sum_vec = vaddq_u64(sum_vec, pairwise_sum3); // Accumulate the sum - } - - // Final reduction of `sum_vec` to a single scalar - sz_u64_t sum = vgetq_lane_u64(sum_vec, 0) + vgetq_lane_u64(sum_vec, 1); - if (length) sum += sz_checksum_serial(text, length); - return sum; -} - -SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // In most cases the `source` and the `target` are not aligned, but we should - // at least make sure that writes don't touch many cache lines. - // NEON has an instruction to load and write 64 bytes at once. - // - // sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - // sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. 
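    //      As a worked example of the arithmetic above: a `target` ending in 0x...30 gives
    //      `head_length == (64 - 48) % 64 == 16`, so 16 scalar bytes bring the destination to a
    //      cache-line boundary, while an already aligned `target` gives `head_length == (64 - 0) % 64 == 0`
    //      instead of a useless 64-byte prologue.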
- // for (; head_length; target += 1, source += 1, head_length -= 1) *target = *source; - // length -= head_length; - // for (; length >= 64; target += 64, source += 64, length -= 64) - // vst4q_u8((sz_u8_t *)target, vld1q_u8_x4((sz_u8_t const *)source)); - // for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = *source; - // - // Sadly, those instructions end up being 20% slower than the code processing 16 bytes at a time: - for (; length >= 16; target += 16, source += 16, length -= 16) - vst1q_u8((sz_u8_t *)target, vld1q_u8((sz_u8_t const *)source)); - if (length) sz_copy_serial(target, source, length); -} - -SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // When moving small buffers, using a small buffer on stack as a temporary storage is faster. - - if (target < source || target >= source + length) { - // Non-overlapping, proceed forward - sz_copy_neon(target, source, length); - } - else { - // Overlapping, proceed backward - target += length; - source += length; - - sz_u128_vec_t src_vec; - while (length >= 16) { - target -= 16, source -= 16, length -= 16; - src_vec.u8x16 = vld1q_u8((sz_u8_t const *)source); - vst1q_u8((sz_u8_t *)target, src_vec.u8x16); - } - while (length) { - target -= 1, source -= 1, length -= 1; - *target = *source; - } - } -} - -SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - uint8x16_t fill_vec = vdupq_n_u8(value); // Broadcast the value across the register - - while (length >= 16) { - vst1q_u8((sz_u8_t *)target, fill_vec); - target += 16; - length -= 16; - } - - // Handle remaining bytes - if (length) sz_fill_serial(target, length, value); -} - -SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } - - sz_size_t head_length = (16 - ((sz_size_t)target % 16)) % 16; // 15 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 16; // 15 or less. - - // We need to pull the lookup table into 16x NEON registers. We have a total of 32 such registers. - // According to the Neoverse V2 manual, the 4-table lookup has a latency of 6 cycles, and 4x throughput. - uint8x16x4_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec; - lut_0_to_63_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 0)); - lut_64_to_127_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 64)); - lut_128_to_191_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 128)); - lut_192_to_255_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 192)); - - sz_u128_vec_t source_vec; - // If the top bit is set in each word of `source_vec`, than we use `lookup_128_to_191_vec` or - // `lookup_192_to_255_vec`. If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`. - sz_u128_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec; - sz_u128_vec_t blended_0_to_255_vec; - - // Process the head with serial code - for (; head_length; target += 1, source += 1, head_length -= 1) *target = lut[*(sz_u8_t const *)source]; - - // Table lookups on Arm are much simpler to use than on x86, as we can use the `vqtbl4q_u8` instruction - // to perform a 4-table lookup in a single instruction. 
The XORs are used to adjust the lookup position - // within each 64-byte range of the table. - // Details on the 4-table lookup: https://lemire.me/blog/2019/07/23/arbitrary-byte-to-byte-maps-using-arm-neon/ - length -= head_length; - length -= tail_length; - for (; length >= 16; source += 16, target += 16, length -= 16) { - source_vec.u8x16 = vld1q_u8((sz_u8_t const *)source); - lookup_0_to_63_vec.u8x16 = vqtbl4q_u8(lut_0_to_63_vec, source_vec.u8x16); - lookup_64_to_127_vec.u8x16 = vqtbl4q_u8(lut_64_to_127_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x40))); - lookup_128_to_191_vec.u8x16 = vqtbl4q_u8(lut_128_to_191_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x80))); - lookup_192_to_255_vec.u8x16 = vqtbl4q_u8(lut_192_to_255_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0xc0))); - blended_0_to_255_vec.u8x16 = vorrq_u8(vorrq_u8(lookup_0_to_63_vec.u8x16, lookup_64_to_127_vec.u8x16), - vorrq_u8(lookup_128_to_191_vec.u8x16, lookup_192_to_255_vec.u8x16)); - vst1q_u8((sz_u8_t *)target, blended_0_to_255_vec.u8x16); - } - - // Process the tail with serial code - for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = lut[*(sz_u8_t const *)source]; -} - -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - sz_u64_t matches; - sz_u128_vec_t h_vec, n_vec, matches_vec; - n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n); - - while (h_length >= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h); - matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16); - // In Arm NEON we don't have a `movemask` to combine it with `ctz` and get the offset of the match. - // But assuming the `vmaxvq` is cheap, we can use it to find the first match, by blending (bitwise selecting) - // the vector with a relative offsets array. - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - - h += 16, h_length -= 16; - } - - return sz_find_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - sz_u64_t matches; - sz_u128_vec_t h_vec, n_vec, matches_vec; - n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n); - - while (h_length >= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h + h_length - 16); - matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; - h_length -= 16; - } - - return sz_rfind_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_u64_t _sz_find_charset_neon_register(sz_u128_vec_t h_vec, uint8x16_t set_top_vec_u8x16, - uint8x16_t set_bottom_vec_u8x16) { - - // Once we've read the characters in the haystack, we want to - // compare them against our bitset. The serial version of that code - // would look like: `(set_->_u8s[c >> 3] & (1u << (c & 7u))) != 0`. - uint8x16_t byte_index_vec = vshrq_n_u8(h_vec.u8x16, 3); - uint8x16_t byte_mask_vec = vshlq_u8(vdupq_n_u8(1), vreinterpretq_s8_u8(vandq_u8(h_vec.u8x16, vdupq_n_u8(7)))); - uint8x16_t matches_top_vec = vqtbl1q_u8(set_top_vec_u8x16, byte_index_vec); - // The table lookup instruction in NEON replies to out-of-bound requests with zeros. - // The values in `byte_index_vec` all fall in [0; 32). So for values under 16, substracting 16 will underflow - // and map into interval [240, 256). Meaning that those will be populated with zeros and we can safely - // merge `matches_top_vec` and `matches_bottom_vec` with a bitwise OR. 
- uint8x16_t matches_bottom_vec = vqtbl1q_u8(set_bottom_vec_u8x16, vsubq_u8(byte_index_vec, vdupq_n_u8(16))); - uint8x16_t matches_vec = vorrq_u8(matches_top_vec, matches_bottom_vec); - // Istead of pure `vandq_u8`, we can immediately broadcast a match presence across each 8-bit word. - matches_vec = vtstq_u8(matches_vec, byte_mask_vec); - return _sz_vreinterpretq_u8_u4(matches_vec); -} - -SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_neon(h, h_length, n); - - // Scan through the string. - // Assuming how tiny the Arm NEON registers are, we should avoid internal branches at all costs. - // That's why, for smaller needles, we use different loops. - if (n_length == 2) { - // Broadcast needle characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_last_vec, n_first_vec, n_last_vec, matches_vec; - // Dealing with 16-bit values, we can load 2 registers at a time and compare 31 possible offsets - // in a single loop iteration. - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]); - for (; h_length >= 17; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1)); - matches_vec.u8x16 = - vandq_u8(vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - } - else if (n_length == 3) { - // Broadcast needle characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - // Comparing 24-bit values is a bumer. Being lazy, I went with the same approach - // as when searching for string over 4 characters long. I only avoid the last comparison. - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[2]); - for (; h_length >= 18; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 2)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - } - else { - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - // Broadcast those characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]); - // Walk through the string. 
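    // The loop below is a vectorized form of a simple scalar pre-filter, sketched here for clarity:
    // only offsets where all three "anomalous" bytes match pay for the full `sz_equal` verification.
    //
    //      for (sz_size_t i = 0; i + n_length <= h_length; ++i)
    //          if (h[i + offset_first] == n[offset_first] && h[i + offset_mid] == n[offset_mid] &&
    //              h[i + offset_last] == n[offset_last] && sz_equal(h + i, n, n_length))
    //              return h + i;
    //
    // The NEON version evaluates 16 such candidate offsets per iteration and uses the 4-bit-per-byte
    // mask from `_sz_vreinterpretq_u8_u4` to jump straight to the next surviving candidate.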
- for (; h_length >= n_length + 16; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_first)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_mid)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_last)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - while (matches) { - int potential_offset = sz_u64_ctz(matches) / 4; - if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - } - - return sz_find_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_neon(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Will contain 4 bits per character. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]); - - sz_cptr_t h_reversed; - for (; h_length >= n_length + 16; h_length -= 16) { - h_reversed = h + h_length - n_length - 16 + 1; - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_first)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_mid)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_last)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - while (matches) { - int potential_offset = sz_u64_clz(matches) / 4; - if (sz_equal(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & (1ull << (63 - potential_offset * 4))) != 0 && - "The bit must be set before we squash it"); - matches &= ~(1ull << (63 - potential_offset * 4)); - } - } - - return sz_rfind_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { - sz_u64_t matches; - sz_u128_vec_t h_vec; - uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); - uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); - - for (; h_length >= 16; h += 16, h_length -= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h)); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - - return sz_find_charset_serial(h, h_length, set); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { - sz_u64_t matches; - sz_u128_vec_t h_vec; - uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); - uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); - - // Check `sz_find_charset_neon` for 
explanations. - for (; h_length >= 16; h_length -= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h) + h_length - 16); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); - if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; - } - - return sz_rfind_charset_serial(h, h_length, set); -} - #pragma clang attribute pop #pragma GCC pop_options -#endif // Arm Neon +#endif // SZ_USE_NEON +#pragma endregion // NEON Implementation -#pragma endregion - -/* @brief Implementation of the string search algorithms using the Arm SVE variable-length registers, available - * in Arm v9 processors. - * - * Implements: - * - memory: {copy, move, fill} - * - comparisons: {equal, order} - * - search: {substring, character, character set} x {forward, reverse}. +/* Implementation of the string search algorithms using the Arm SVE variable-length registers, + * available in Arm v9 processors, like in Apple M4+ and Graviton 3+ CPUs. */ -#pragma region ARM SVE - -#if SZ_USE_ARM_SVE +#pragma region SVE Implementation +#if SZ_USE_SVE #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+sve") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) -SZ_PUBLIC void sz_fill_sve(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - svuint8_t value_vec = svdup_u8(value); - sz_size_t vec_len = svcntb(); // Vector length in bytes (scalable) - - if (length <= vec_len) { - // Small buffer case: use mask to handle small writes - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)length); - svst1_u8(mask, (unsigned char *)target, value_vec); - } - else { - // Calculate head, body, and tail sizes - sz_size_t head_length = vec_len - ((sz_size_t)target % vec_len); - sz_size_t tail_length = (sz_size_t)(target + length) % vec_len; - sz_size_t body_length = length - head_length - tail_length; - - // Handle unaligned head - svbool_t head_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)head_length); - svst1_u8(head_mask, (unsigned char *)target, value_vec); - target += head_length; - - // Aligned body loop - for (; body_length >= vec_len; target += vec_len, body_length -= vec_len) { - svst1_u8(svptrue_b8(), (unsigned char *)target, value_vec); - } - - // Handle unaligned tail - svbool_t tail_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)tail_length); - svst1_u8(tail_mask, (unsigned char *)target, value_vec); - } -} - -SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - sz_size_t vec_len = svcntb(); // Vector length in bytes - - // Arm Neoverse V2 cores in Graviton 4, for example, come with 256 KB of L1 data cache per core, - // and 8 MB of L2 cache per core. Moreover, the L1 cache is fully associative. - // With two strings, we may consider the overal workload huge, if each exceeds 1 MB in length. - // - // int is_huge = length >= 4ull * 1024ull * 1024ull; - // - // When the buffer is small, there isn't much to innovate. - if (length <= vec_len) { - // Small buffer case: use mask to handle small writes - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - } - // When dealing with larger buffers, similar to AVX-512, we want minimize unaligned operations - // and handle the head, body, and tail separately. We can also traverse the buffer in both directions - // as Arm generally supports more simultaneous stores than x86 CPUs. 
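    //
    // As a worked example of that bidirectional walk: with a 16-byte vector and a 96-byte body, the
    // loop below copies [0, 16) and [80, 96) on the first iteration, [16, 32) and [64, 80) on the
    // second, and [32, 48) and [48, 64) on the third, converging toward the middle; odd sizes leave a
    // remainder for the two masked tails that follow.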
- // - // For gigantic datasets, similar to AVX-512, non-temporal "loads" and "stores" can be used. - // Sadly, if the register size (16 byte or larger) is smaller than a cache-line (64 bytes) - // we will pay a huge penalty on loads, fetching the same content many times. - // It may be better to allow caching (and subsequent eviction), in favor of using four-element - // tuples, wich will be guaranteed to be a multiple of a cache line. - // - // Another approach is to use the `LD4B` instructions, which will populate four registers at once. - // This however, further decreases the performance from LibC-like 29 GB/s to 20 GB/s. - else { - // Calculating head, body, and tail sizes depends on the `vec_len`, - // but it's runtime constant, and the modulo operation is expensive! - // Instead we use the fact, that it's always a multiple of 128 bits or 16 bytes. - sz_size_t head_length = 16 - ((sz_size_t)target % 16); - sz_size_t tail_length = (sz_size_t)(target + length) % 16; - sz_size_t body_length = length - head_length - tail_length; - - // Handle unaligned parts - svbool_t head_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)head_length); - svuint8_t head_data = svld1_u8(head_mask, (unsigned char *)source); - svst1_u8(head_mask, (unsigned char *)target, head_data); - svbool_t tail_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)tail_length); - svuint8_t tail_data = svld1_u8(tail_mask, (unsigned char *)source + head_length + body_length); - svst1_u8(tail_mask, (unsigned char *)target + head_length + body_length, tail_data); - target += head_length; - source += head_length; - - // Aligned body loop, walking in two directions - for (; body_length >= vec_len * 2; target += vec_len, source += vec_len, body_length -= vec_len * 2) { - svuint8_t forward_data = svld1_u8(svptrue_b8(), (unsigned char *)source); - svuint8_t backward_data = svld1_u8(svptrue_b8(), (unsigned char *)source + body_length - vec_len); - svst1_u8(svptrue_b8(), (unsigned char *)target, forward_data); - svst1_u8(svptrue_b8(), (unsigned char *)target + body_length - vec_len, backward_data); - } - // Up to (vec_len * 2 - 1) bytes of data may be left in the body, - // so we can unroll the last two optional loop iterations. - if (body_length > vec_len) { - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)body_length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - body_length -= vec_len; - source += body_length; - target += body_length; - } - if (body_length) { - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)body_length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - } - } -} - #pragma clang attribute pop #pragma GCC pop_options -#endif // Arm SVE +#endif // SZ_USE_SVE +#pragma endregion // SVE Implementation -#pragma endregion - -/* - * @brief Pick the right implementation for the string search algorithms. +/* Pick the right implementation for the string search algorithms. + * To override this behavior and precompile all backends - set `SZ_DYNAMIC_DISPATCH` to 1. 
*/ #pragma region Compile Time Dispatching - -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t ins, sz_size_t length) { return sz_hash_serial(ins, length); } -SZ_PUBLIC void sz_tolower(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_tolower_serial(ins, length, outs); } -SZ_PUBLIC void sz_toupper(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toupper_serial(ins, length, outs); } -SZ_PUBLIC void sz_toascii(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toascii_serial(ins, length, outs); } -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t ins, sz_size_t length) { return sz_isascii_serial(ins, length); } - -SZ_PUBLIC void sz_hashes_fingerprint(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_ptr_t fingerprint, - sz_size_t fingerprint_bytes) { - - sz_bool_t fingerprint_length_is_power_of_two = (sz_bool_t)((fingerprint_bytes & (fingerprint_bytes - 1)) == 0); - sz_string_view_t fingerprint_buffer = {fingerprint, fingerprint_bytes}; - - // There are several issues related to the fingerprinting algorithm. - // First, the memory traversal order is important. - // https://blog.stuffedcow.net/2015/08/pagewalk-coherence/ - - // In most cases the fingerprint length will be a power of two. - if (fingerprint_length_is_power_of_two == sz_false_k) - sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_non_pow2_callback, &fingerprint_buffer); - else - sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_pow2_callback, &fingerprint_buffer); -} - #if !SZ_DYNAMIC_DISPATCH -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { -#if SZ_USE_X86_AVX512 - return sz_checksum_avx512(text, length); -#elif SZ_USE_X86_AVX2 - return sz_checksum_avx2(text, length); -#elif SZ_USE_ARM_NEON - return sz_checksum_neon(text, length); -#else - return sz_checksum_serial(text, length); -#endif -} - -SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { -#if SZ_USE_X86_AVX512 - return sz_equal_avx512(a, b, length); -#elif SZ_USE_X86_AVX2 - return sz_equal_avx2(a, b, length); -#elif SZ_USE_ARM_NEON - return sz_equal_neon(a, b, length); -#else - return sz_equal_serial(a, b, length); -#endif -} - -SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { -#if SZ_USE_X86_AVX512 - return sz_order_avx512(a, a_length, b, b_length); -#elif SZ_USE_X86_AVX2 - return sz_order_avx2(a, a_length, b, b_length); -#elif SZ_USE_ARM_NEON - return sz_order_neon(a, a_length, b, b_length); -#else - return sz_order_serial(a, a_length, b, b_length); -#endif -} - -SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_X86_AVX512 - sz_copy_avx512(target, source, length); -#elif SZ_USE_X86_AVX2 - sz_copy_avx2(target, source, length); -#elif SZ_USE_ARM_NEON - sz_copy_neon(target, source, length); -#else - sz_copy_serial(target, source, length); -#endif -} - -SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_X86_AVX512 - sz_move_avx512(target, source, length); -#elif SZ_USE_X86_AVX2 - sz_move_avx2(target, source, length); -#elif SZ_USE_ARM_NEON - sz_move_neon(target, source, length); -#else - sz_move_serial(target, source, length); -#endif -} - -SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) { -#if SZ_USE_X86_AVX512 - sz_fill_avx512(target, length, value); -#elif SZ_USE_X86_AVX2 - sz_fill_avx2(target, length, value); -#elif SZ_USE_ARM_NEON - sz_fill_neon(target, length, value); -#else - sz_fill_serial(target, length, value); -#endif -} - 
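// For illustration, when `SZ_DYNAMIC_DISPATCH` is disabled, these thin wrappers are expected to inline
// away, and the preprocessor picks exactly one backend per call site. On a hypothetical Arm build with
// only `SZ_USE_ARM_NEON` enabled, the calls below resolve straight to the NEON kernels:
//
//      char buffer[4096], scratch[4096];
//      sz_fill(buffer, sizeof(buffer), 0);               // compiles down to sz_fill_neon(...)
//      sz_copy(scratch, buffer, sizeof(scratch));        // compiles down to sz_copy_neon(...)
//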
-SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { -#if SZ_USE_X86_AVX512 - sz_look_up_transform_avx512(source, length, lut, target); -#elif SZ_USE_X86_AVX2 - sz_look_up_transform_avx2(source, length, lut, target); -#elif SZ_USE_ARM_NEON - sz_look_up_transform_neon(source, length, lut, target); -#else - sz_look_up_transform_serial(source, length, lut, target); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_X86_AVX512 - return sz_find_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_X86_AVX2 - return sz_find_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_ARM_NEON - return sz_find_byte_neon(haystack, h_length, needle); -#else - return sz_find_byte_serial(haystack, h_length, needle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_X86_AVX512 - return sz_rfind_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_X86_AVX2 - return sz_rfind_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_ARM_NEON - return sz_rfind_byte_neon(haystack, h_length, needle); -#else - return sz_rfind_byte_serial(haystack, h_length, needle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_X86_AVX512 - return sz_find_avx512(haystack, h_length, needle, n_length); -#elif SZ_USE_X86_AVX2 - return sz_find_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_ARM_NEON - return sz_find_neon(haystack, h_length, needle, n_length); -#else - return sz_find_serial(haystack, h_length, needle, n_length); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_X86_AVX512 - return sz_rfind_avx512(haystack, h_length, needle, n_length); -#elif SZ_USE_X86_AVX2 - return sz_rfind_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_ARM_NEON - return sz_rfind_neon(haystack, h_length, needle, n_length); -#else - return sz_rfind_serial(haystack, h_length, needle, n_length); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_X86_AVX512 - return sz_find_charset_avx512(text, length, set); -#elif SZ_USE_X86_AVX2 - return sz_find_charset_avx2(text, length, set); -#elif SZ_USE_ARM_NEON - return sz_find_charset_neon(text, length, set); -#else - return sz_find_charset_serial(text, length, set); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_X86_AVX512 - return sz_rfind_charset_avx512(text, length, set); -#elif SZ_USE_X86_AVX2 - return sz_rfind_charset_avx2(text, length, set); -#elif SZ_USE_ARM_NEON - return sz_rfind_charset_neon(text, length, set); -#else - return sz_rfind_charset_serial(text, length, set); -#endif -} - SZ_DYNAMIC sz_size_t sz_hamming_distance( // sz_cptr_t a, sz_size_t a_length, // sz_cptr_t b, sz_size_t b_length, // @@ -7075,7 +1340,7 @@ SZ_DYNAMIC sz_size_t sz_edit_distance( // sz_cptr_t a, sz_size_t a_length, // sz_cptr_t b, sz_size_t b_length, // sz_size_t bound, sz_memory_allocator_t *alloc) { -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE return sz_edit_distance_avx512(a, a_length, b, b_length, bound, alloc); #else return sz_edit_distance_serial(a, a_length, b, b_length, bound, alloc); @@ -7089,68 +1354,21 @@ SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( // return 
_sz_edit_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc); } -SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_error_cost_t const *subs, sz_error_cost_t gap, - sz_memory_allocator_t *alloc) { -#if SZ_USE_X86_AVX512 - return sz_alignment_score_avx512(a, a_length, b, b_length, subs, gap, alloc); +SZ_DYNAMIC sz_ssize_t sz_alignment_score( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // + sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { +#if SZ_USE_ICE + return sz_alignment_score_ice(a, a_length, b, b_length, subs, gap, alloc); #else return sz_alignment_score_serial(a, a_length, b, b_length, subs, gap, alloc); #endif } -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle) { -#if SZ_USE_X86_AVX512 - sz_hashes_avx512(text, length, window_length, window_step, callback, callback_handle); -#elif SZ_USE_X86_AVX2 - sz_hashes_avx2(text, length, window_length, window_step, callback, callback_handle); -#else - sz_hashes_serial(text, length, window_length, window_step, callback, callback_handle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_rfind_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_rfind_charset(h, h_length, &set); -} - -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { - sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data); -} - -#endif -#pragma endregion +#endif // !SZ_DYNAMIC_DISPATCH +#pragma endregion // Compile Time Dispatching #ifdef __cplusplus -#pragma GCC diagnostic pop } #endif // __cplusplus - -#endif // STRINGZILLA_H_ +#endif // STRINGZILLA_SIMISLARITY_H_ From be4c63d926c8628451726863e4d14dbd1ea374dd Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 15:37:01 +0000 Subject: [PATCH 034/751] Fix: Filter `hash.h` file --- include/stringzilla/hash.h | 7483 +++--------------------------------- 1 file changed, 621 insertions(+), 6862 deletions(-) diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index de7fbcac..bf24a5e6 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -1,422 +1,30 @@ /** - * @brief StringZilla is a collection of advanced string algorithms, 
designed to be used in Big Data applications. - * It is generally faster than LibC, and has a broader & cleaner interface, and targets modern x86 CPUs - * with AVX-512 and Arm NEON and older CPUs with SWAR and auto-vectorization. - * - * Consider overriding the following macros to customize the library: - * - * - `SZ_DEBUG=0` - whether to enable debug assertions and logging. - * - `SZ_DYNAMIC_DISPATCH=0` - whether to use runtime dispatching of the most advanced SIMD backend. - * - `SZ_USE_MISALIGNED_LOADS=0` - whether to use misaligned loads on platforms that support them. - * - `SZ_SWAR_THRESHOLD=24` - threshold for switching to SWAR backend over serial byte-level for-loops. - * - `SZ_USE_X86_AVX512=?` - whether to use AVX-512 instructions on x86_64. - * - `SZ_USE_X86_AVX2=?` - whether to use AVX2 instructions on x86_64. - * - `SZ_USE_ARM_NEON=?` - whether to use NEON instructions on ARM. - * - `SZ_USE_ARM_SVE=?` - whether to use SVE instructions on ARM. - * - * @see StringZilla: https://github.com/ashvardanian/StringZilla/blob/main/README.md - * @see LibC String: https://pubs.opengroup.org/onlinepubs/009695399/basedefs/string.h.html - * - * @file stringzilla.h + * @brief Hardware-accelerated string hashing and checksums. + * @file hash.h * @author Ash Vardanian - */ -#ifndef STRINGZILLA_H_ -#define STRINGZILLA_H_ - -#define STRINGZILLA_VERSION_MAJOR 3 -#define STRINGZILLA_VERSION_MINOR 11 -#define STRINGZILLA_VERSION_PATCH 0 - -/** - * @brief When set to 1, the library will include the following LibC headers: and . - * In debug builds (SZ_DEBUG=1), the library will also include and . * - * You may want to disable this compiling for use in the kernel, or in embedded systems. - * You may also avoid them, if you are very sensitive to compilation time and avoid pre-compiled headers. - * https://artificial-mind.net/projects/compile-health/ - */ -#ifndef SZ_AVOID_LIBC -#define SZ_AVOID_LIBC (0) // true or false -#endif - -/** - * @brief A misaligned load can be - trying to fetch eight consecutive bytes from an address - * that is not divisible by eight. On x86 enabled by default. On ARM it's not. + * Includes core APIs: * - * Most platforms support it, but there is no industry standard way to check for those. - * This value will mostly affect the performance of the serial (SWAR) backend. - */ -#ifndef SZ_USE_MISALIGNED_LOADS -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) -#define SZ_USE_MISALIGNED_LOADS (1) // true or false -#else -#define SZ_USE_MISALIGNED_LOADS (0) // true or false -#endif -#endif - -/** - * @brief Removes compile-time dispatching, and replaces it with runtime dispatching. - * So the `sz_find` function will invoke the most advanced backend supported by the CPU, - * that runs the program, rather than the most advanced backend supported by the CPU - * used to compile the library or the downstream application. - */ -#ifndef SZ_DYNAMIC_DISPATCH -#define SZ_DYNAMIC_DISPATCH (0) // true or false -#endif - -/** - * @brief Analogous to `size_t` and `std::size_t`, unsigned integer, identical to pointer size. - * 64-bit on most platforms where pointers are 64-bit. - * 32-bit on platforms where pointers are 32-bit. - */ -#if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64) -#define SZ_DETECT_64_BIT (1) -#define SZ_SIZE_MAX (0xFFFFFFFFFFFFFFFFull) // Largest unsigned integer that fits into 64 bits. -#define SZ_SSIZE_MAX (0x7FFFFFFFFFFFFFFFull) // Largest signed integer that fits into 64 bits. 
-#else -#define SZ_DETECT_64_BIT (0) -#define SZ_SIZE_MAX (0xFFFFFFFFu) // Largest unsigned integer that fits into 32 bits. -#define SZ_SSIZE_MAX (0x7FFFFFFFu) // Largest signed integer that fits into 32 bits. -#endif - -/** - * @brief On Big-Endian machines StringZilla will work in compatibility mode. - * This disables SWAR hacks to minimize code duplication, assuming practically - * all modern popular platforms are Little-Endian. + * - `sz_checksum` - for byte-level checksums. + * - `sz_hash` - for 64-bit single-shot hashing. + * - `sz_hashes` - producing the rolling hashes of a string. + * - `sz_generate` - populating buffers with random data. * - * This variable is hard to infer from macros reliably. It's best to set it manually. - * For that CMake provides the `TestBigEndian` and `CMAKE__BYTE_ORDER` (from 3.20 onwards). - * In Python one can check `sys.byteorder == 'big'` in the `setup.py` script and pass the appropriate macro. - * https://stackoverflow.com/a/27054190 - */ -#ifndef SZ_DETECT_BIG_ENDIAN -#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \ - defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || defined(__MIBSEB) || defined(__MIBSEB__) -#define SZ_DETECT_BIG_ENDIAN (1) //< It's a big-endian target architecture -#else -#define SZ_DETECT_BIG_ENDIAN (0) //< It's a little-endian target architecture -#endif -#endif - -/* - * Debugging and testing. - */ -#ifndef SZ_DEBUG -#if defined(DEBUG) || defined(_DEBUG) // This means "Not using DEBUG information". -#define SZ_DEBUG (1) -#else -#define SZ_DEBUG (0) -#endif -#endif - -/** - * @brief Threshold for switching to SWAR (8-bytes at a time) backend over serial byte-level for-loops. - * On very short strings, under 16 bytes long, at most a single word will be processed with SWAR. - * Assuming potentially misaligned loads, SWAR makes sense only after ~24 bytes. - */ -#ifndef SZ_SWAR_THRESHOLD -#if SZ_DEBUG -#define SZ_SWAR_THRESHOLD (8u) // 8 bytes in debug builds -#else -#define SZ_SWAR_THRESHOLD (24u) // 24 bytes in release builds -#endif -#endif - -/* Annotation for the public API symbols: + * Convenience functions for character-set matching: * - * - `SZ_PUBLIC` is used for functions that are part of the public API. - * - `SZ_INTERNAL` is used for internal helper functions with unstable APIs. - * - `SZ_DYNAMIC` is used for functions that are part of the public API, but are dispatched at runtime. + * - `sz_hashes_fingerprint` + * - `sz_hashes_intersection` */ -#ifndef SZ_DYNAMIC -#if SZ_DYNAMIC_DISPATCH -#if defined(_WIN32) || defined(__CYGWIN__) -#define SZ_DYNAMIC __declspec(dllexport) -#define SZ_EXTERNAL __declspec(dllimport) -#define SZ_PUBLIC inline static -#define SZ_INTERNAL inline static -#else -#define SZ_DYNAMIC __attribute__((visibility("default"))) -#define SZ_EXTERNAL extern -#define SZ_PUBLIC __attribute__((unused)) inline static -#define SZ_INTERNAL __attribute__((always_inline)) inline static -#endif // _WIN32 || __CYGWIN__ -#else -#define SZ_DYNAMIC inline static -#define SZ_EXTERNAL extern -#define SZ_PUBLIC inline static -#define SZ_INTERNAL inline static -#endif // SZ_DYNAMIC_DISPATCH -#endif // SZ_DYNAMIC +#ifndef STRINGZILLA_HASH_H_ +#define STRINGZILLA_HASH_H_ -/** - * @brief Alignment macro for 64-byte alignment. 
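 *        Typically placed in front of a definition, e.g. for a hypothetical table:
 *
 *            SZ_ALIGN64 sz_u8_t cache_line_sized_row[64];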
- */ -#if defined(_MSC_VER) -#define SZ_ALIGN64 __declspec(align(64)) -#elif defined(__GNUC__) || defined(__clang__) -#define SZ_ALIGN64 __attribute__((aligned(64))) -#else -#define SZ_ALIGN64 -#endif +#include "types.h" #ifdef __cplusplus extern "C" { #endif -/* - * Let's infer the integer types or pull them from LibC, - * if that is allowed by the user. - */ -#if !SZ_AVOID_LIBC -#include // `size_t` -#include // `uint8_t` -typedef int8_t sz_i8_t; // Always 8 bits -typedef uint8_t sz_u8_t; // Always 8 bits -typedef uint16_t sz_u16_t; // Always 16 bits -typedef int32_t sz_i32_t; // Always 32 bits -typedef uint32_t sz_u32_t; // Always 32 bits -typedef uint64_t sz_u64_t; // Always 64 bits -typedef int64_t sz_i64_t; // Always 64 bits -typedef size_t sz_size_t; // Pointer-sized unsigned integer, 32 or 64 bits -typedef ptrdiff_t sz_ssize_t; // Signed version of `sz_size_t`, 32 or 64 bits - -#else // if SZ_AVOID_LIBC: - -// ! The C standard doesn't specify the signedness of char. -// ! On x86 char is signed by default while on Arm it is unsigned by default. -// ! That's why we don't define `sz_char_t` and generally use explicit `sz_i8_t` and `sz_u8_t`. -typedef signed char sz_i8_t; // Always 8 bits -typedef unsigned char sz_u8_t; // Always 8 bits -typedef unsigned short sz_u16_t; // Always 16 bits -typedef int sz_i32_t; // Always 32 bits -typedef unsigned int sz_u32_t; // Always 32 bits -typedef long long sz_i64_t; // Always 64 bits -typedef unsigned long long sz_u64_t; // Always 64 bits - -// Now we need to redefine the `size_t`. -// Microsoft Visual C++ (MSVC) typically follows LLP64 data model on 64-bit platforms, -// where integers, pointers, and long types have different sizes: -// -// > `int` is 32 bits -// > `long` is 32 bits -// > `long long` is 64 bits -// > pointer (thus, `size_t`) is 64 bits -// -// In contrast, GCC and Clang on 64-bit Unix-like systems typically follow the LP64 model, where: -// -// > `int` is 32 bits -// > `long` and pointer (thus, `size_t`) are 64 bits -// > `long long` is also 64 bits -// -// Source: https://learn.microsoft.com/en-us/windows/win32/winprog64/abstract-data-models -#if SZ_DETECT_64_BIT -typedef unsigned long long sz_size_t; // 64-bit. -typedef long long sz_ssize_t; // 64-bit. -#else -typedef unsigned sz_size_t; // 32-bit. -typedef unsigned sz_ssize_t; // 32-bit. -#endif // SZ_DETECT_64_BIT - -#endif // SZ_AVOID_LIBC - -/** - * @brief Compile-time assert macro similar to `static_assert` in C++. - */ -#define sz_static_assert(condition, name) \ - typedef struct { \ - int static_assert_##name : (condition) ? 1 : -1; \ - } sz_static_assert_##name##_t - -sz_static_assert(sizeof(sz_size_t) == sizeof(void *), sz_size_t_must_be_pointer_size); -sz_static_assert(sizeof(sz_ssize_t) == sizeof(void *), sz_ssize_t_must_be_pointer_size); - -#pragma region Public API - -typedef char *sz_ptr_t; // A type alias for `char *` -typedef char const *sz_cptr_t; // A type alias for `char const *` -typedef sz_i8_t sz_error_cost_t; // Character mismatch cost for fuzzy matching functions - -typedef sz_u64_t sz_sorted_idx_t; // Index of a sorted string in a list of strings - -typedef enum { sz_false_k = 0, sz_true_k = 1 } sz_bool_t; // Only one relevant bit -typedef enum { sz_less_k = -1, sz_equal_k = 0, sz_greater_k = 1 } sz_ordering_t; // Only three possible states: <=> - -/** - * @brief Tiny string-view structure. It's POD type, unlike the `std::string_view`. 
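 *        Being a plain aggregate, it can be initialized in place, e.g.:
 *
 *            sz_string_view_t haystack_view = {haystack_pointer, haystack_length};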
- */ -typedef struct sz_string_view_t { - sz_cptr_t start; - sz_size_t length; -} sz_string_view_t; - -/** - * @brief Enumeration of SIMD capabilities of the target architecture. - * Used to introspect the supported functionality of the dynamic library. - */ -typedef enum sz_capability_t { - sz_cap_serial_k = 1, /// Serial (non-SIMD) capability - sz_cap_any_k = 0x7FFFFFFF, /// Mask representing any capability - - sz_cap_arm_neon_k = 1 << 10, /// ARM NEON capability - sz_cap_arm_sve_k = 1 << 11, /// ARM SVE capability TODO: Not yet supported or used - sz_cap_arm_sve2_k = 1 << 12, - sz_cap_arm_sve2p1_k = 1 << 13, - sz_cap_x86_avx2_k = 1 << 20, /// x86 AVX2 capability - sz_cap_x86_avx512f_k = 1 << 21, /// x86 AVX512 F capability - sz_cap_x86_avx512bw_k = 1 << 22, /// x86 AVX512 BW instruction capability - sz_cap_x86_avx512vl_k = 1 << 23, /// x86 AVX512 VL instruction capability - sz_cap_x86_avx512vbmi_k = 1 << 24, /// x86 AVX512 VBMI instruction capability - sz_cap_x86_gfni_k = 1 << 25, /// x86 AVX512 GFNI instruction capability - -} sz_capability_t; - -/** - * @brief Function to determine the SIMD capabilities of the current machine @b only at @b runtime. - * @return A bitmask of the SIMD capabilities represented as a `sz_capability_t` enum value. - */ -SZ_DYNAMIC sz_capability_t sz_capabilities(void); - -/** - * @brief Bit-set structure for 256 possible byte values. Useful for filtering and search. - * @see sz_charset_init, sz_charset_add, sz_charset_contains, sz_charset_invert - */ -typedef union sz_charset_t { - sz_u64_t _u64s[4]; - sz_u32_t _u32s[8]; - sz_u16_t _u16s[16]; - sz_u8_t _u8s[32]; -} sz_charset_t; - -/** @brief Initializes a bit-set to an empty collection, meaning - all characters are banned. */ -SZ_PUBLIC void sz_charset_init(sz_charset_t *s) { s->_u64s[0] = s->_u64s[1] = s->_u64s[2] = s->_u64s[3] = 0; } - -/** @brief Adds a character to the set and accepts @b unsigned integers. */ -SZ_PUBLIC void sz_charset_add_u8(sz_charset_t *s, sz_u8_t c) { s->_u64s[c >> 6] |= (1ull << (c & 63u)); } - -/** @brief Adds a character to the set. Consider @b sz_charset_add_u8. */ -SZ_PUBLIC void sz_charset_add(sz_charset_t *s, char c) { sz_charset_add_u8(s, *(sz_u8_t *)(&c)); } // bitcast - -/** @brief Checks if the set contains a given character and accepts @b unsigned integers. */ -SZ_PUBLIC sz_bool_t sz_charset_contains_u8(sz_charset_t const *s, sz_u8_t c) { - // Checking the bit can be done in different ways: - // - (s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0 - // - (s->_u32s[c >> 5] & (1u << (c & 31u))) != 0 - // - (s->_u16s[c >> 4] & (1u << (c & 15u))) != 0 - // - (s->_u8s[c >> 3] & (1u << (c & 7u))) != 0 - return (sz_bool_t)((s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0); -} - -/** @brief Checks if the set contains a given character. Consider @b sz_charset_contains_u8. */ -SZ_PUBLIC sz_bool_t sz_charset_contains(sz_charset_t const *s, char c) { - return sz_charset_contains_u8(s, *(sz_u8_t *)(&c)); // bitcast -} - -/** @brief Inverts the contents of the set, so allowed character get disallowed, and vice versa. */ -SZ_PUBLIC void sz_charset_invert(sz_charset_t *s) { - s->_u64s[0] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[1] ^= 0xFFFFFFFFFFFFFFFFull, // - s->_u64s[2] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[3] ^= 0xFFFFFFFFFFFFFFFFull; -} - -typedef void *(*sz_memory_allocate_t)(sz_size_t, void *); -typedef void (*sz_memory_free_t)(void *, sz_size_t, void *); -typedef sz_u64_t (*sz_random_generator_t)(void *); - -/** - * @brief Some complex pattern matching algorithms may require memory allocations. 
- * This structure is used to pass the memory allocator to those functions. - * @see sz_memory_allocator_init_fixed - */ -typedef struct sz_memory_allocator_t { - sz_memory_allocate_t allocate; - sz_memory_free_t free; - void *handle; -} sz_memory_allocator_t; - -/** - * @brief Initializes a memory allocator to use the system default `malloc` and `free`. - * ! The function is not available if the library was compiled with `SZ_AVOID_LIBC`. - * - * @param alloc Memory allocator to initialize. - */ -SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc); - -/** - * @brief Initializes a memory allocator to use a static-capacity buffer. - * No dynamic allocations will be performed. - * - * @param alloc Memory allocator to initialize. - * @param buffer Buffer to use for allocations. - * @param length Length of the buffer. @b Must be greater than 8 bytes. Different values would be optimal for - * different algorithms and input lengths, but 4096 bytes (one RAM page) is a good default. - */ -SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length); - -/** - * @brief The number of bytes a stack-allocated string can hold, including the SZ_NULL termination character. - * ! This can't be changed from outside. Don't use the `#error` as it may already be included and set. - */ -#ifdef SZ_STRING_INTERNAL_SPACE -#undef SZ_STRING_INTERNAL_SPACE -#endif -#define SZ_STRING_INTERNAL_SPACE (sizeof(sz_size_t) * 3 - 1) // 3 pointers minus one byte for an 8-bit length - -/** - * @brief Tiny memory-owning string structure with a Small String Optimization (SSO). - * Differs in layout from Folly, Clang, GCC, and probably most other implementations. - * It's designed to avoid any branches on read-only operations, and can store up - * to 22 characters on stack on 64-bit machines, followed by the SZ_NULL-termination character. - * - * @section Changing Length - * - * One nice thing about this design, is that you can, in many cases, change the length of the string - * without any branches, invoking a `+=` or `-=` on the 64-bit `length` field. If the string is on heap, - * the solution is obvious. If it's on stack, inplace decrement wouldn't affect the top bytes of the string, - * only changing the last byte containing the length. - */ -typedef union sz_string_t { - -#if !SZ_DETECT_BIG_ENDIAN - - struct external { - sz_ptr_t start; - sz_size_t length; - sz_size_t space; - sz_size_t padding; - } external; - - struct internal { - sz_ptr_t start; - sz_u8_t length; - char chars[SZ_STRING_INTERNAL_SPACE]; - } internal; - -#else - - struct external { - sz_ptr_t start; - sz_size_t space; - sz_size_t padding; - sz_size_t length; - } external; - - struct internal { - sz_ptr_t start; - char chars[SZ_STRING_INTERNAL_SPACE]; - sz_u8_t length; - } internal; - -#endif - - sz_size_t words[4]; - -} sz_string_t; - -typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t); -typedef sz_u64_t (*sz_checksum_t)(sz_cptr_t, sz_size_t); -typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t); -typedef sz_ordering_t (*sz_order_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -typedef void (*sz_to_converter_t)(sz_cptr_t, sz_size_t, sz_ptr_t); +#pragma region Core API /** * @brief Computes the 64-bit check-sum of bytes in a string. 
@@ -428,9 +36,6 @@ typedef void (*sz_to_converter_t)(sz_cptr_t, sz_size_t, sz_ptr_t); */ SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length); -/** @copydoc sz_checksum */ -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length); - /** * @brief Computes the 64-bit unsigned hash of a string. Fairly fast for short strings, * simple implementation, and supports rolling computation, reused in other APIs. @@ -444,108 +49,74 @@ SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length); */ SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length); -/** @copydoc sz_hash */ -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length); - /** - * @brief Checks if two string are equal. - * Similar to `memcmp(a, b, length) == 0` in LibC and `a == b` in STL. - * - * The implementation of this function is very similar to `sz_order`, but the usage patterns are different. - * This function is more often used in parsing, while `sz_order` is often used in sorting. - * It works best on platforms with cheap + * @brief Computes the Karp-Rabin rolling hashes of a string supplying them to the provided `callback`. + * Can be used for similarity scores, search, ranking, etc. * - * @param a First string to compare. - * @param b Second string to compare. - * @param length Number of bytes in both strings. - * @return 1 if strings match, 0 otherwise. - */ -SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length); - -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length); - -/** - * @brief Estimates the relative order of two strings. Equivalent to `memcmp(a, b, length)` in LibC. - * Can be used on different length strings. + * Rabin-Karp-like rolling hashes can have very high-level of collisions and depend + * on the choice of bases and the prime number. That's why, often two hashes from the same + * family are used with different bases. * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * @return Negative if (a < b), positive if (a > b), zero if they are equal. - */ -SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); - -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); - -/** - * @brief Look Up Table @b (LUT) transformation of a string. Equivalent to `for (char & c : text) c = lut[c]`. + * 1. Kernighan and Ritchie's function uses 31, a prime close to the size of English alphabet. + * 2. To be friendlier to byte-arrays and UTF8, we use 257 for the second function. * - * Can be used to implement some form of string normalization, partially masking punctuation marks, - * or converting between different character sets, like uppercase or lowercase. Surprisingly, also has - * broad implications in image processing, where image channel transformations are often done using LUTs. + * Choosing the right ::window_length is task- and domain-dependant. For example, most English words are + * between 3 and 7 characters long, so a window of 4 bytes would be a good choice. For DNA sequences, + * the ::window_length might be a multiple of 3, as the codons are 3 (nucleotides) bytes long. + * With such minimalistic alphabets of just four characters (AGCT) longer windows might be needed. 
+ * For protein sequences the alphabet is 20 characters long, so the window can be shorter, than for DNAs. * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param lut Look Up Table to apply. Must be exactly @b 256 bytes long. - * @param result Output string, can point to the same address as ::text. + * @param text String to hash. + * @param length Number of bytes in the string. + * @param window_length Length of the rolling window in bytes. + * @param window_step Step of reported hashes. @b Must be power of two. Should be smaller than `window_length`. + * @param callback Function receiving the start & length of a substring, the hash, and the `callback_handle`. + * @param callback_handle Optional user-provided pointer to be passed to the `callback`. + * @see sz_hashes_fingerprint, sz_hashes_intersection */ -SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); - -typedef void (*sz_look_up_transform_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t); - -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); +SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // + sz_hash_callback_t callback, void *callback_handle); /** - * @brief Equivalent to `for (char & c : text) c = tolower(c)`. - * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html + * @brief Computes the Karp-Rabin rolling hashes of a string outputting a binary fingerprint. + * Such fingerprints can be compared with Hamming or Jaccard (Tanimoto) distance for similarity. * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_tolower(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = toupper(c)`. + * The algorithm doesn't clear the fingerprint buffer on start, so it can be invoked multiple times + * to produce a fingerprint of a longer string, by passing the previous fingerprint as the ::fingerprint. + * It can also be reused to produce multi-resolution fingerprints by changing the ::window_length + * and calling the same function multiple times for the same input ::text. * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html + * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, + * avoiding cache-coherency penalties of remote on-heap buffers. * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. + * @param text String to hash. + * @param length Number of bytes in the string. 
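/* Usage sketch, not part of the patch: a minimal caller for `sz_hashes`, keeping the smallest
 * rolling hash of every 4-byte window as a crude MinHash-style signature. The callback signature
 * follows `sz_hash_callback_t` declared in this header family; the umbrella include path, the
 * `keep_min_hash` helper, and the window/step choices are illustrative assumptions. */
#include <stdio.h>
#include <stringzilla/stringzilla.h>

static void keep_min_hash(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) {
    sz_u64_t *min_hash = (sz_u64_t *)handle;
    (void)start, (void)length;
    if (hash < *min_hash) *min_hash = hash;
}

int main(void) {
    char const text[] = "the quick brown fox jumps over the lazy dog";
    sz_u64_t min_hash = 0xFFFFFFFFFFFFFFFFull;
    // Window of 4 bytes, reporting every window (step of 1, a power of two).
    sz_hashes(text, sizeof(text) - 1, 4, 1, &keep_min_hash, &min_hash);
    printf("smallest rolling hash: 0x%016llx\n", (unsigned long long)min_hash);
    return 0;
}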
+ * @param fingerprint Output fingerprint buffer. + * @param fingerprint_bytes Number of bytes in the fingerprint buffer. + * @param window_length Length of the rolling window in bytes. + * @see sz_hashes, sz_hashes_intersection */ -SZ_PUBLIC void sz_toupper(sz_cptr_t text, sz_size_t length, sz_ptr_t result); +SZ_PUBLIC void sz_hashes_fingerprint( // + sz_cptr_t text, sz_size_t length, sz_size_t window_length, // + sz_ptr_t fingerprint, sz_size_t fingerprint_bytes); /** - * @brief Equivalent to `for (char & c : text) c = toascii(c)`. + * @brief Given a hash-fingerprint of a textual document, computes the number of intersecting hashes + * of the incoming document. Can be used for document scoring and search. * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Checks if all characters in the range are valid ASCII characters. + * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, + * avoiding cache-coherency penalties of remote on-heap buffers. * - * @param text String to be analyzed. - * @param length Number of bytes in the string. - * @return Whether all characters are valid ASCII characters. + * @param text Input document. + * @param length Number of bytes in the input document. + * @param fingerprint Reference document fingerprint. + * @param fingerprint_bytes Number of bytes in the reference documents fingerprint. + * @param window_length Length of the rolling window in bytes. + * @see sz_hashes, sz_hashes_fingerprint */ -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t text, sz_size_t length); +SZ_PUBLIC sz_size_t sz_hashes_intersection( // + sz_cptr_t text, sz_size_t length, sz_size_t window_length, // + sz_cptr_t fingerprint, sz_size_t fingerprint_bytes); /** * @brief Generates a random string for a given alphabet, avoiding integer division and modulo operations. @@ -567,5118 +138,312 @@ SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t text, sz_size_t length); SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, sz_random_generator_t generate, void *generator); +/** @copydoc sz_checksum */ +SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length); +/** @copydoc sz_hash */ +SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length); /** @copydoc sz_generate */ SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, sz_random_generator_t generate, void *generator); +/** @copydoc sz_hashes */ +SZ_PUBLIC void sz_hashes_serial(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // + sz_hash_callback_t callback, void *callback_handle); -/** - * @brief Similar to `memcpy`, copies contents of one string into another. - * The behavior is undefined if the strings overlap. - * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. - */ -SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** - * @brief Similar to `memmove`, copies (moves) contents of one string into another. - * Unlike `sz_copy`, allows overlapping strings as arguments. - * - * @param target String to copy into. 
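/* Usage sketch, not part of the patch: fingerprinting a reference document and scoring an incoming
 * one with `sz_hashes_fingerprint` and `sz_hashes_intersection`. The 64-byte buffer and the 7-byte
 * window are illustrative choices; the `memset` is needed because, as documented above, the
 * function does not clear the fingerprint buffer itself. */
#include <string.h>
#include <stringzilla/stringzilla.h>

int main(void) {
    char const reference[] = "the quick brown fox jumps over the lazy dog";
    char const query[] = "a quick brown fox ran by";
    char fingerprint[64];
    memset(fingerprint, 0, sizeof(fingerprint));
    sz_hashes_fingerprint(reference, sizeof(reference) - 1, 7, fingerprint, sizeof(fingerprint));
    sz_size_t shared = sz_hashes_intersection(query, sizeof(query) - 1, 7, fingerprint, sizeof(fingerprint));
    return shared > 0 ? 0 : 1; // larger `shared` values suggest more overlapping 7-grams
}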
- * @param length Number of bytes to copy. - * @param source String to copy from. - */ -SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -typedef void (*sz_move_t)(sz_ptr_t, sz_cptr_t, sz_size_t); - -/** - * @brief Similar to `memset`, fills a string with a given value. - * - * @param target String to fill. - * @param length Number of bytes to fill. - * @param value Value to fill with. - */ -SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value); - -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value); - -typedef void (*sz_fill_t)(sz_ptr_t, sz_size_t, sz_u8_t); +#pragma endregion // Core API -/** - * @brief Initializes a string class instance to an empty value. - */ -SZ_PUBLIC void sz_string_init(sz_string_t *string); +#pragma region Serial Implementation -/** - * @brief Convenience function checking if the provided string is stored inside of the ::string instance itself, - * alternative being - allocated in a remote region of the heap. - */ -SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string); +SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length) { + sz_u64_t checksum = 0; + sz_u8_t const *text_u8 = (sz_u8_t const *)text; + sz_u8_t const *text_end = text_u8 + length; + for (; text_u8 != text_end; ++text_u8) checksum += *text_u8; + return checksum; +} -/** - * @brief Unpacks the opaque instance of a string class into its components. - * Recommended to use only in read-only operations. +/* + * One hardware-accelerated way of mixing hashes can be CRC, but it's only implemented for 32-bit values. + * Using a Boost-like mixer works very poorly in such case: * - * @param string String to unpack. - * @param start Pointer to the start of the string. - * @param length Number of bytes in the string, before the SZ_NULL character. - * @param space Number of bytes allocated for the string (heap or stack), including the SZ_NULL character. - * @param is_external Whether the string is allocated on the heap externally, or fits withing ::string instance. - */ -SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, - sz_bool_t *is_external); - -/** - * @brief Unpacks only the start and length of the string. - * Recommended to use only in read-only operations. + * hash_first ^ (hash_second + 0x517cc1b727220a95 + (hash_first << 6) + (hash_first >> 2)); * - * @param string String to unpack. - * @param start Pointer to the start of the string. - * @param length Number of bytes in the string, before the SZ_NULL character. + * Let's stick to the Fibonacci hash trick using the golden ratio. + * https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ */ -SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length); +#define _sz_hash_mix(first, second) ((first * 11400714819323198485ull) ^ (second * 11400714819323198485ull)) +#define _sz_shift_low(x) (x) +#define _sz_shift_high(x) ((x + 77ull) & 0xFFull) +#define _sz_prime_mod(x) (x % SZ_U64_MAX_PRIME) -/** - * @brief Constructs a string of a given ::length with noisy contents. - * Use the returned character pointer to populate the string. - * - * @param string String to initialize. 
- * @param length Number of bytes in the string, before the SZ_NULL character. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator); +SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length) { -/** - * @brief Doesn't change the contents or the length of the string, but grows the available memory capacity. - * This is beneficial, if several insertions are expected, and we want to minimize allocations. - * - * @param string String to grow. - * @param new_capacity The number of characters to reserve space for, including existing ones. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the new start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator); + sz_u64_t hash_low = 0; + sz_u64_t hash_high = 0; + sz_u8_t const *text = (sz_u8_t const *)start; + sz_u8_t const *text_end = text + length; -/** - * @brief Grows the string by adding an uninitialized region of ::added_length at the given ::offset. - * Would often be used in conjunction with one or more `sz_copy` calls to populate the allocated region. - * Similar to `sz_string_reserve`, but changes the length of the ::string. - * - * @param string String to grow. - * @param offset Offset of the first byte to reserve space for. - * If provided offset is larger than the length, it will be capped. - * @param added_length The number of new characters to reserve space for. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the new start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length, - sz_memory_allocator_t *allocator); - -/** - * @brief Removes a range from a string. Changes the length, but not the capacity. - * Performs no allocations or deallocations and can't fail. - * - * @param string String to clean. - * @param offset Offset of the first byte to remove. - * @param length Number of bytes to remove. Out-of-bound ranges will be capped. - * @return Number of bytes removed. - */ -SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length); - -/** - * @brief Shrinks the string to fit the current length, if it's allocated on the heap. - * It's the reverse operation of ::sz_string_reserve. - * - * @param string String to shrink. - * @param allocator Memory allocator to use for the allocation. - * @return Whether the operation was successful. The only failures can come from the allocator. - * On failure, the string will remain unchanged. - */ -SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator); - -/** - * @brief Frees the string, if it's allocated on the heap. - * If the string is on the stack, the function clears/resets the state. 
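/* Usage sketch, not part of the patch: constructing a short string with `sz_string_init_length`
 * and reading it back with `sz_string_range`. The 128-byte arena and the "hello" payload are
 * illustrative; a 5-byte string fits the on-stack Small String Optimization space, so the fixed
 * allocator is only a fallback here and nothing has to be freed afterwards. */
#include <stringzilla/stringzilla.h>

int main(void) {
    char arena[128];
    sz_memory_allocator_t alloc;
    sz_memory_allocator_init_fixed(&alloc, arena, sizeof(arena));

    sz_string_t string;
    sz_ptr_t contents = sz_string_init_length(&string, 5, &alloc);
    if (!contents) return 1;       // SZ_NULL means the allocation failed
    sz_copy(contents, "hello", 5); // populate the reserved bytes

    sz_ptr_t start;
    sz_size_t length;
    sz_string_range(&string, &start, &length); // start == contents, length == 5
    return length == 5 ? 0 : 1;
}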
- */ -SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator); - -#pragma endregion - -#pragma region Fast Substring Search API - -typedef sz_cptr_t (*sz_find_byte_t)(sz_cptr_t, sz_size_t, sz_cptr_t); -typedef sz_cptr_t (*sz_find_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_charset_t const *); - -/** - * @brief Locates first matching byte in a string. Equivalent to `memchr(haystack, *needle, h_length)` in LibC. - * - * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memchr.S - * Aarch64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/aarch64/memchr.S - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the first match. - */ -SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** - * @brief Locates last matching byte in a string. Equivalent to `memrchr(haystack, *needle, h_length)` in LibC. - * - * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memrchr.S - * Aarch64 implementation: missing - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the last match. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** - * @brief Locates first matching substring. - * Equivalent to `memmem(haystack, h_length, needle, n_length)` in LibC. - * Similar to `strstr(haystack, needle)` in LibC, but requires known length. - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the first match. - */ -SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** - * @brief Locates the last matching substring. - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the last match. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** - * @brief Finds the first character present from the ::set, present in ::text. - * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_rfind_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". 
- * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the first matching character from ::set. - */ -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** - * @brief Finds the last character present from the ::set, present in ::text. - * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_find_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". - * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the last matching character from ::set. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -#pragma endregion - -#pragma region String Similarity Measures API - -/** - * @brief Computes the Hamming distance between two strings - number of not matching characters. - * Difference in length is is counted as a mismatch. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for the distance, the `bound` if was exceeded. - * - * @see sz_hamming_distance_utf8 - * @see https://en.wikipedia.org/wiki/Hamming_distance - */ -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); - -/** @copydoc sz_hamming_distance */ -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); - -/** - * @brief Computes the Hamming distance between two @b UTF8 strings - number of not matching characters. - * Difference in length is is counted as a mismatch. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for the distance, the `bound` if was exceeded. 
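/* Usage sketch, not part of the patch: skipping to the first whitespace byte with a `sz_charset_t`
 * and `sz_find_charset`, as suggested by the parsing examples above. The helper name and the
 * assumption that a miss yields a null pointer are illustrative, not taken from the header. */
#include <stringzilla/stringzilla.h>

static sz_cptr_t find_first_whitespace(sz_cptr_t text, sz_size_t length) {
    sz_charset_t set;
    sz_charset_init(&set); // start with an empty set: every byte is banned
    sz_charset_add(&set, ' ');
    sz_charset_add(&set, '\t');
    sz_charset_add(&set, '\n');
    sz_charset_add(&set, '\r');
    return sz_find_charset(text, length, &set); // address of the first matching byte
}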
- * - * @see sz_hamming_distance - * @see https://en.wikipedia.org/wiki/Hamming_distance - */ -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); - -/** @copydoc sz_hamming_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); - -typedef sz_size_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t); - -/** - * @brief Computes the Levenshtein edit-distance between two strings using the Wagner-Fisher algorithm. - * Similar to the Needleman-Wunsch alignment algorithm. Often used in fuzzy string matching. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Exclusive upper bound on the distance, that allows us to exit early. - * Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore. - * Pass zero to check if the strings are equal. - * @return Unsigned integer for the edit distance. Zero means the strings are equal. - * Returns the `bound` if it was exceeded or `SZ_SIZE_MAX` if the memory allocation failed. - * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default - * @see https://en.wikipedia.org/wiki/Levenshtein_distance - */ -SZ_DYNAMIC sz_size_t sz_edit_distance(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** @copydoc sz_edit_distance */ -SZ_PUBLIC sz_size_t sz_edit_distance_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** - * @brief Computes the Levenshtein edit-distance between two @b UTF8 strings. - * Unlike `sz_edit_distance`, reports the distance in Unicode codepoints, and not in bytes. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for edit distance, the `bound` if was exceeded or `SZ_SIZE_MAX` - * if the memory allocation failed. 
- * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default, sz_edit_distance - * @see https://en.wikipedia.org/wiki/Levenshtein_distance - */ -SZ_DYNAMIC sz_size_t sz_edit_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -typedef sz_size_t (*sz_edit_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_memory_allocator_t *); - -/** @copydoc sz_edit_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_edit_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** - * @brief Computes Needleman–Wunsch alignment score for two string. Often used in bioinformatics and cheminformatics. - * Similar to the Levenshtein edit-distance, parameterized for gap and substitution penalties. - * - * Not commutative in the general case, as the order of the strings matters, as `sz_alignment_score(a, b)` may - * not be equal to `sz_alignment_score(b, a)`. Becomes @b commutative, if the substitution costs are symmetric. - * Equivalent to the negative Levenshtein distance, if: `gap == -1` and `subs[i][j] == (i == j ? 0: -1)`. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * @param gap Penalty cost for gaps - insertions and removals. - * @param subs Substitution costs matrix with 256 x 256 values for all pairs of characters. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @return Signed similarity score. Can be negative, depending on the substitution costs. - * If the memory allocation fails, the function returns `SZ_SSIZE_MAX`. - * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default - * @see https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm - */ -SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); - -/** @copydoc sz_alignment_score */ -SZ_PUBLIC sz_ssize_t sz_alignment_score_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); - -typedef sz_ssize_t (*sz_alignment_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *, - sz_error_cost_t, sz_memory_allocator_t *); - -typedef void (*sz_hash_callback_t)(sz_cptr_t, sz_size_t, sz_u64_t, void *user); - -/** - * @brief Computes the Karp-Rabin rolling hashes of a string supplying them to the provided `callback`. - * Can be used for similarity scores, search, ranking, etc. - * - * Rabin-Karp-like rolling hashes can have very high-level of collisions and depend - * on the choice of bases and the prime number. That's why, often two hashes from the same - * family are used with different bases. - * - * 1. Kernighan and Ritchie's function uses 31, a prime close to the size of English alphabet. - * 2. To be friendlier to byte-arrays and UTF8, we use 257 for the second function. - * - * Choosing the right ::window_length is task- and domain-dependant. 
For example, most English words are - * between 3 and 7 characters long, so a window of 4 bytes would be a good choice. For DNA sequences, - * the ::window_length might be a multiple of 3, as the codons are 3 (nucleotides) bytes long. - * With such minimalistic alphabets of just four characters (AGCT) longer windows might be needed. - * For protein sequences the alphabet is 20 characters long, so the window can be shorter, than for DNAs. - * - * @param text String to hash. - * @param length Number of bytes in the string. - * @param window_length Length of the rolling window in bytes. - * @param window_step Step of reported hashes. @b Must be power of two. Should be smaller than `window_length`. - * @param callback Function receiving the start & length of a substring, the hash, and the `callback_handle`. - * @param callback_handle Optional user-provided pointer to be passed to the `callback`. - * @see sz_hashes_fingerprint, sz_hashes_intersection - */ -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); - -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); - -typedef void (*sz_hashes_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_size_t, sz_hash_callback_t, void *); - -/** - * @brief Computes the Karp-Rabin rolling hashes of a string outputting a binary fingerprint. - * Such fingerprints can be compared with Hamming or Jaccard (Tanimoto) distance for similarity. - * - * The algorithm doesn't clear the fingerprint buffer on start, so it can be invoked multiple times - * to produce a fingerprint of a longer string, by passing the previous fingerprint as the ::fingerprint. - * It can also be reused to produce multi-resolution fingerprints by changing the ::window_length - * and calling the same function multiple times for the same input ::text. - * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text String to hash. - * @param length Number of bytes in the string. - * @param fingerprint Output fingerprint buffer. - * @param fingerprint_bytes Number of bytes in the fingerprint buffer. - * @param window_length Length of the rolling window in bytes. - * @see sz_hashes, sz_hashes_intersection - */ -SZ_PUBLIC void sz_hashes_fingerprint( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_ptr_t fingerprint, sz_size_t fingerprint_bytes); - -typedef void (*sz_hashes_fingerprint_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_ptr_t, sz_size_t); - -/** - * @brief Given a hash-fingerprint of a textual document, computes the number of intersecting hashes - * of the incoming document. Can be used for document scoring and search. - * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text Input document. - * @param length Number of bytes in the input document. - * @param fingerprint Reference document fingerprint. - * @param fingerprint_bytes Number of bytes in the reference documents fingerprint. - * @param window_length Length of the rolling window in bytes. 
- * @see sz_hashes, sz_hashes_fingerprint - */ -SZ_PUBLIC sz_size_t sz_hashes_intersection( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_cptr_t fingerprint, sz_size_t fingerprint_bytes); - -typedef sz_size_t (*sz_hashes_intersection_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_cptr_t, sz_size_t); - -#pragma endregion - -#pragma region Convenience API - -/** - * @brief Finds the first character in the haystack, that is present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the first character in the haystack, that is @b not present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the last character in the haystack, that is present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the last character in the haystack, that is @b not present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -#pragma endregion - -#pragma region String Sequences API - -struct sz_sequence_t; - -typedef sz_cptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t); -typedef sz_bool_t (*sz_string_is_less_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); - -typedef struct sz_sequence_t { - sz_sorted_idx_t *order; - sz_size_t count; - sz_sequence_member_start_t get_start; - sz_sequence_member_length_t get_length; - void const *handle; -} sz_sequence_t; - -/** - * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. - * Expects ::offsets to contains `count + 1` entries, the last pointing at the end - * of the last string, indicating the total length of the ::tape. - */ -SZ_PUBLIC void sz_sequence_from_u32tape(sz_cptr_t *start, sz_u32_t const *offsets, sz_size_t count, - sz_sequence_t *sequence); - -/** - * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. - * Expects ::offsets to contains `count + 1` entries, the last pointing at the end - * of the last string, indicating the total length of the ::tape. - */ -SZ_PUBLIC void sz_sequence_from_u64tape(sz_cptr_t *start, sz_u64_t const *offsets, sz_size_t count, - sz_sequence_t *sequence); - -/** - * @brief Similar to `std::partition`, given a predicate splits the sequence into two parts. - * The algorithm is unstable, meaning that elements may change relative order, as long - * as they are in the right partition. This is the simpler algorithm for partitioning. 
- */
-SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate);
-
-/**
- * @brief Inplace `std::set_union` for two consecutive chunks forming the same continuous `sequence`.
- *
- * @param partition The number of elements in the first sub-sequence in `sequence`.
- * @param less Comparison function, to determine the lexicographic ordering.
- */
-SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less);
-
-/**
- * @brief Sorting algorithm, combining Radix Sort for the first 32 bits of every word
- * and a follow-up by a more conventional sorting procedure on equally prefixed parts.
- */
-SZ_PUBLIC void sz_sort(sz_sequence_t *sequence);
-
-/**
- * @brief Partial sorting algorithm, combining Radix Sort for the first 32 bits of every word
- * and a follow-up by a more conventional sorting procedure on equally prefixed parts.
- */
-SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t n);
-
-/**
- * @brief Intro-Sort algorithm that supports custom comparators.
- */
-SZ_PUBLIC void sz_sort_intro(sz_sequence_t *sequence, sz_sequence_comparator_t less);
-
-#pragma endregion
-
-/*
- * Hardware feature detection.
- * All of those can be controlled by the user.
- */
-#ifndef SZ_USE_X86_AVX512
-#ifdef __AVX512BW__
-#define SZ_USE_X86_AVX512 1
-#else
-#define SZ_USE_X86_AVX512 0
-#endif
-#endif
-
-#ifndef SZ_USE_X86_AVX2
-#ifdef __AVX2__
-#define SZ_USE_X86_AVX2 1
-#else
-#define SZ_USE_X86_AVX2 0
-#endif
-#endif
-
-#ifndef SZ_USE_ARM_NEON
-#ifdef __ARM_NEON
-#define SZ_USE_ARM_NEON 1
-#else
-#define SZ_USE_ARM_NEON 0
-#endif
-#endif
-
-#ifndef SZ_USE_ARM_SVE
-#ifdef __ARM_FEATURE_SVE
-#define SZ_USE_ARM_SVE 1
-#else
-#define SZ_USE_ARM_SVE 0
-#endif
-#endif
-
-/*
- * Include hardware-specific headers.
- */
-#if SZ_USE_X86_AVX512 || SZ_USE_X86_AVX2
-#include <immintrin.h>
-#endif // SZ_USE_X86...
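/* Usage sketch, not part of the patch: sorting three strings with `sz_sort` through the
 * `sz_sequence_t` accessors declared above. The demo array, the accessor names, and the
 * assumption that `order` starts as the identity permutation are illustrative. */
#include <string.h>
#include <stringzilla/stringzilla.h>

static char const *demo_strings[] = {"banana", "apple", "cherry"};

static sz_cptr_t demo_get_start(struct sz_sequence_t const *sequence, sz_size_t i) {
    (void)sequence;
    return demo_strings[i];
}
static sz_size_t demo_get_length(struct sz_sequence_t const *sequence, sz_size_t i) {
    (void)sequence;
    return strlen(demo_strings[i]);
}

int main(void) {
    sz_sorted_idx_t order[3] = {0, 1, 2}; // identity permutation
    sz_sequence_t sequence;
    sequence.order = order;
    sequence.count = 3;
    sequence.get_start = demo_get_start;
    sequence.get_length = demo_get_length;
    sequence.handle = demo_strings;
    sz_sort(&sequence);
    // `order` should now read {1, 0, 2}: "apple", "banana", "cherry".
    return 0;
}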
-#if SZ_USE_ARM_NEON
-#if !defined(_MSC_VER)
-#include <arm_acle.h>
-#endif
-#include <arm_neon.h>
-#endif // SZ_USE_ARM_NEON
-#if SZ_USE_ARM_SVE
-#if !defined(_MSC_VER)
-#include <arm_sve.h>
-#endif
-#endif // SZ_USE_ARM_SVE
-
-#pragma region Hardware Specific API
-
-#if SZ_USE_X86_AVX512
-
-/** @copydoc sz_equal */
-SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length);
-/** @copydoc sz_order */
-SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length);
-/** @copydoc sz_copy */
-SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
-/** @copydoc sz_move */
-SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
-/** @copydoc sz_fill */
-SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value);
-/** @copydoc sz_look_up_transform */
-SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target);
-/** @copydoc sz_find_byte */
-SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
-/** @copydoc sz_rfind_byte */
-SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
-/** @copydoc sz_find */
-SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length);
-/** @copydoc sz_rfind */
-SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length);
-/** @copydoc sz_find_charset */
-SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set);
-/** @copydoc sz_rfind_charset */
-SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set);
-/** @copydoc sz_edit_distance */
-SZ_PUBLIC sz_size_t sz_edit_distance_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
- sz_size_t bound, sz_memory_allocator_t *alloc);
-/** @copydoc sz_alignment_score */
-SZ_PUBLIC sz_ssize_t sz_alignment_score_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
- sz_error_cost_t const *subs, sz_error_cost_t gap, //
- sz_memory_allocator_t *alloc);
-/** @copydoc sz_hashes */
-SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, //
- sz_hash_callback_t callback, void *callback_handle);
-#endif
-
-#if SZ_USE_X86_AVX2
-/** @copydoc sz_equal */
-SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length);
-/** @copydoc sz_order */
-SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length);
-/** @copydoc sz_copy */
-SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
-/** @copydoc sz_move */
-SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
-/** @copydoc sz_fill */
-SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value);
-/** @copydoc sz_look_up_transform */
-SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target);
-/** @copydoc sz_find_byte */
-SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
-/** @copydoc sz_rfind_byte */
-SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
-/** @copydoc sz_find */
-SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t 
n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle); -#endif - -#if SZ_USE_ARM_NEON -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -#endif - -#if SZ_USE_ARM_SVE -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_sve(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_sve(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_sve(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_sve(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_sve(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -#endif - -#pragma endregion - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wconversion" - -/* - ********************************************************************************************************************** - 
********************************************************************************************************************** - ********************************************************************************************************************** - * - * This is where we the actual implementation begins. - * The rest of the file is hidden from the public API. - * - ********************************************************************************************************************** - ********************************************************************************************************************** - ********************************************************************************************************************** - */ - -#pragma region Compiler Extensions and Helper Functions - -#pragma GCC visibility push(hidden) - -/** - * @brief Helper-macro to mark potentially unused variables. - */ -#define sz_unused(x) ((void)(x)) - -/** - * @brief Helper-macro casting a variable to another type of the same size. - */ -#define sz_bitcast(type, value) (*((type *)&(value))) - -/** - * @brief Defines `SZ_NULL`, analogous to `NULL`. - * The default often comes from locale.h, stddef.h, - * stdio.h, stdlib.h, string.h, time.h, or wchar.h. - */ -#ifdef __GNUG__ -#define SZ_NULL __null -#define SZ_NULL_CHAR __null -#else -#define SZ_NULL ((void *)0) -#define SZ_NULL_CHAR ((char *)0) -#endif - -/** - * @brief Cache-line width, that will affect the execution of some algorithms, - * like equality checks and relative order computing. - */ -#define SZ_CACHE_LINE_WIDTH (64) // bytes - -/** - * @brief Similar to `assert`, the `sz_assert` is used in the SZ_DEBUG mode - * to check the invariants of the library. It's a no-op in the SZ_RELEASE mode. - * @note If you want to catch it, put a breakpoint at @b `__GI_exit` - */ -#if SZ_DEBUG && defined(SZ_AVOID_LIBC) && !SZ_AVOID_LIBC && !defined(SZ_PIC) -#include // `fprintf` -#include // `EXIT_FAILURE` -SZ_PUBLIC void _sz_assert_failure(char const *condition, char const *file, int line) { - fprintf(stderr, "Assertion failed: %s, in file %s, line %d\n", condition, file, line); - exit(EXIT_FAILURE); -} -#define sz_assert(condition) \ - do { \ - if (!(condition)) { _sz_assert_failure(#condition, __FILE__, __LINE__); } \ - } while (0) -#else -#define sz_assert(condition) ((void)(condition)) -#endif - -/* Intrinsics aliases for MSVC, GCC, Clang, and Clang-Cl. - * The following section of compiler intrinsics comes in 2 flavors. - */ -#if defined(_MSC_VER) && !defined(__clang__) // On Clang-CL -#include - -// Sadly, when building Win32 images, we can't use the `_tzcnt_u64`, `_lzcnt_u64`, -// `_BitScanForward64`, or `_BitScanReverse64` intrinsics. For now it's a simple `for`-loop. -// TODO: In the future we can switch to a more efficient De Bruijn's algorithm. -// https://www.chessprogramming.org/BitScan -// https://www.chessprogramming.org/De_Bruijn_Sequence -// https://gist.github.com/resilar/e722d4600dbec9752771ab4c9d47044f -// -// Use the serial version on 32-bit x86 and on Arm. 
-#if (defined(_WIN32) && !defined(_WIN64)) || defined(_M_ARM) || defined(_M_ARM64) -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 1) == 0) { n++, x >>= 1; } - return n; -} -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 0x8000000000000000ull) == 0) { n++, x <<= 1; } - return n; -} -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { - x = x - ((x >> 1) & 0x5555555555555555ull); - x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull); - return (((x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Full) * 0x0101010101010101ull) >> 56; -} -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 1) == 0) { n++, x >>= 1; } - return n; -} -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 0x80000000u) == 0) { n++, x <<= 1; } - return n; -} -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { - x = x - ((x >> 1) & 0x55555555); - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - return (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; -} -#else -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return (int)_tzcnt_u64(x); } -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return (int)_lzcnt_u64(x); } -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return (int)__popcnt64(x); } -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return (int)_tzcnt_u32(x); } -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return (int)_lzcnt_u32(x); } -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return (int)__popcnt(x); } -#endif -// Force the byteswap functions to be intrinsics, because when /Oi- is given, these will turn into CRT function calls, -// which breaks when `SZ_AVOID_LIBC` is given -#pragma intrinsic(_byteswap_uint64) -SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return _byteswap_uint64(val); } -#pragma intrinsic(_byteswap_ulong) -SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return _byteswap_ulong(val); } -#else -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return __builtin_popcountll(x); } -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return __builtin_popcount(x); } -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return __builtin_ctzll(x); } -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return __builtin_clzll(x); } -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return __builtin_ctz(x); } // ! Undefined if `x == 0` -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return __builtin_clz(x); } // ! Undefined if `x == 0` -SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return __builtin_bswap64(val); } -SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return __builtin_bswap32(val); } -#endif - -SZ_INTERNAL sz_u64_t sz_u64_rotl(sz_u64_t x, sz_u64_t r) { return (x << r) | (x >> (64 - r)); } - -/** - * @brief Select bits from either ::a or ::b depending on the value of ::mask bits. - * - * Similar to `_mm_blend_epi16` intrinsic on x86. - * Described in the "Bit Twiddling Hacks" by Sean Eron Anderson. - * https://graphics.stanford.edu/~seander/bithacks.html#ConditionalSetOrClearBitsWithoutBranching - */ -SZ_INTERNAL sz_u64_t sz_u64_blend(sz_u64_t a, sz_u64_t b, sz_u64_t mask) { return a ^ ((a ^ b) & mask); } - -/* - * Efficiently computing the minimum and maximum of two or three values can be tricky. - * The simple branching baseline would be: - * - * x < y ? x : y // can replace with 1 conditional move - * - * Branchless approach is well known for signed integers, but it doesn't apply to unsigned ones. 
- * https://stackoverflow.com/questions/514435/templatized-branchless-int-max-min-function
- * https://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
- * Using only bit-shifts for signed integers it would be:
- *
- *      y + ((x - y) & (x - y) >> 31) // 4 unique operations
- *
- * Alternatively, for any integers using multiplication:
- *
- *      (x > y) * y + (x <= y) * x // 5 operations
- *
- * Alternatively, to avoid multiplication:
- *
- *      x & ~((x < y) - 1) + y & ((x < y) - 1) // 6 unique operations
- */
-#define sz_min_of_two(x, y) (x < y ? x : y)
-#define sz_max_of_two(x, y) (x < y ? y : x)
-#define sz_min_of_three(x, y, z) sz_min_of_two(x, sz_min_of_two(y, z))
-#define sz_max_of_three(x, y, z) sz_max_of_two(x, sz_max_of_two(y, z))
-
-/** @brief Branchless minimum function for two signed 32-bit integers. */
-SZ_INTERNAL sz_i32_t sz_i32_min_of_two(sz_i32_t x, sz_i32_t y) { return y + ((x - y) & (x - y) >> 31); }
-
-/** @brief Branchless maximum function for two signed 32-bit integers. */
-SZ_INTERNAL sz_i32_t sz_i32_max_of_two(sz_i32_t x, sz_i32_t y) { return x - ((x - y) & (x - y) >> 31); }
-
-/**
- * @brief Clamps signed offsets in a string to a valid range. Used for Pythonic-style slicing.
- */
-SZ_INTERNAL void sz_ssize_clamp_interval(sz_size_t length, sz_ssize_t start, sz_ssize_t end,
-                                         sz_size_t *normalized_offset, sz_size_t *normalized_length) {
-    // TODO: Remove branches.
-    // Normalize negative indices
-    if (start < 0) start += length;
-    if (end < 0) end += length;
-
-    // Clamp indices to a valid range
-    if (start < 0) start = 0;
-    if (end < 0) end = 0;
-    if (start > (sz_ssize_t)length) start = length;
-    if (end > (sz_ssize_t)length) end = length;
-
-    // Ensure start <= end
-    if (start > end) start = end;
-
-    *normalized_offset = start;
-    *normalized_length = end - start;
-}
-
-/**
- * @brief Compute the logarithm base 2 of a positive integer, rounding down.
- */
-SZ_INTERNAL sz_size_t sz_size_log2i_nonzero(sz_size_t x) {
-    sz_assert(x > 0 && "Non-positive numbers have no defined logarithm");
-    sz_size_t leading_zeros = sz_u64_clz(x);
-    return 63 - leading_zeros;
-}
-
-/**
- * @brief Compute the smallest power of two greater than or equal to ::x.
- */
-SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) {
-    // Unlike the commonly used trick with `clz` intrinsics, this approach is valid across the whole range of `x`.
-    // https://stackoverflow.com/a/10143264
-    x--;
-    x |= x >> 1;
-    x |= x >> 2;
-    x |= x >> 4;
-    x |= x >> 8;
-    x |= x >> 16;
-#if SZ_DETECT_64_BIT
-    x |= x >> 32;
-#endif
-    x++;
-    return x;
-}
-
-/**
- * @brief Transposes an 8x8 bit matrix packed in a `sz_u64_t`.
- *
- *        There is a well known SWAR sequence for that known to chess programmers,
- *        willing to flip a bit-matrix of pieces along the main A1-H8 diagonal.
- *        https://www.chessprogramming.org/Flipping_Mirroring_and_Rotating
- *        https://lukas-prokop.at/articles/2021-07-23-transpose
- */
-SZ_INTERNAL sz_u64_t sz_u64_transpose(sz_u64_t x) {
-    sz_u64_t t;
-    t = x ^ (x << 36);
-    x ^= 0xf0f0f0f00f0f0f0full & (t ^ (x >> 36));
-    t = 0xcccc0000cccc0000ull & (x ^ (x << 18));
-    x ^= t ^ (t >> 18);
-    t = 0xaa00aa00aa00aa00ull & (x ^ (x << 9));
-    x ^= t ^ (t >> 9);
-    return x;
-}
-
-/**
- * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence.
- */
-SZ_INTERNAL void sz_u64_swap(sz_u64_t *a, sz_u64_t *b) {
-    sz_u64_t t = *a;
-    *a = *b;
-    *b = t;
-}
-
-/**
- * @brief Helper, that swaps two pointers representing the order of elements in the sequence.
- */ -SZ_INTERNAL void sz_pointer_swap(void **a, void **b) { - void *t = *a; - *a = *b; - *b = t; -} - -/** - * @brief Helper structure to simplify work with 16-bit words. - * @see sz_u16_load - */ -typedef union sz_u16_vec_t { - sz_u16_t u16; - sz_u8_t u8s[2]; -} sz_u16_vec_t; - -/** - * @brief Load a 16-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u16_vec_t sz_u16_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u16_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u16_vec_t *)ptr); -#else - return *((__unaligned sz_u16_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u16_vec_t const *result = (sz_u16_vec_t const *)ptr; - return *result; -#endif -} - -/** - * @brief Helper structure to simplify work with 32-bit words. - * @see sz_u32_load - */ -typedef union sz_u32_vec_t { - sz_u32_t u32; - sz_u16_t u16s[2]; - sz_u8_t u8s[4]; -} sz_u32_vec_t; - -/** - * @brief Load a 32-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u32_vec_t sz_u32_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u32_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - result.u8s[2] = ptr[2]; - result.u8s[3] = ptr[3]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u32_vec_t *)ptr); -#else - return *((__unaligned sz_u32_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u32_vec_t const *result = (sz_u32_vec_t const *)ptr; - return *result; -#endif -} - -/** - * @brief Helper structure to simplify work with 64-bit words. - * @see sz_u64_load - */ -typedef union sz_u64_vec_t { - sz_u64_t u64; - sz_u32_t u32s[2]; - sz_u16_t u16s[4]; - sz_u8_t u8s[8]; -} sz_u64_vec_t; - -/** - * @brief Load a 64-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u64_vec_t sz_u64_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u64_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - result.u8s[2] = ptr[2]; - result.u8s[3] = ptr[3]; - result.u8s[4] = ptr[4]; - result.u8s[5] = ptr[5]; - result.u8s[6] = ptr[6]; - result.u8s[7] = ptr[7]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u64_vec_t *)ptr); -#else - return *((__unaligned sz_u64_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u64_vec_t const *result = (sz_u64_vec_t const *)ptr; - return *result; -#endif -} - -/** @brief Helper function, using the supplied fixed-capacity buffer to allocate memory. */ -SZ_INTERNAL sz_ptr_t _sz_memory_allocate_fixed(sz_size_t length, void *handle) { - sz_size_t capacity; - sz_copy((sz_ptr_t)&capacity, (sz_cptr_t)handle, sizeof(sz_size_t)); - sz_size_t consumed_capacity = sizeof(sz_size_t); - if (consumed_capacity + length > capacity) return SZ_NULL_CHAR; - return (sz_ptr_t)handle + consumed_capacity; -} - -/** @brief Helper "no-op" function, simulating memory deallocation when we use a "static" memory buffer. 
*/ -SZ_INTERNAL void _sz_memory_free_fixed(sz_ptr_t start, sz_size_t length, void *handle) { - sz_unused(start && length && handle); -} - -/** @brief An internal callback used to set a bit in a power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) & (fingerprint_bytes - 1)] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback used to set a bit in a @b non power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_non_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) % fingerprint_bytes] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback, used to mix all the running hashes into one pointer-size value. */ -SZ_INTERNAL void _sz_hashes_fingerprint_scalar_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *scalar_handle) { - sz_unused(start && length && hash && scalar_handle); - sz_size_t *scalar_ptr = (sz_size_t *)scalar_handle; - *scalar_ptr ^= hash; -} - -/** - * @brief Chooses the offsets of the most interesting characters in a search needle. - * - * Search throughput can significantly deteriorate if we are matching the wrong characters. - * Say the needle is "aXaYa", and we are comparing the first, second, and last character. - * If we use SIMD and compare many offsets at a time, comparing against "a" in every register is a waste. - * - * Similarly, dealing with UTF8 inputs, we know that the lower bits of each character code carry more information. - * Cyrillic alphabet, for example, falls into [0x0410, 0x042F] code range for uppercase [А, Я], and - * into [0x0430, 0x044F] for lowercase [а, я]. Scanning through a text written in Russian, half of the - * bytes will carry absolutely no value and will be equal to 0x04. - */ -SZ_INTERNAL void _sz_locate_needle_anomalies(sz_cptr_t start, sz_size_t length, // - sz_size_t *first, sz_size_t *second, sz_size_t *third) { - *first = 0; - *second = length / 2; - *third = length - 1; - - // - int has_duplicates = // - start[*first] == start[*second] || // - start[*first] == start[*third] || // - start[*second] == start[*third]; - - // Loop through letters to find non-colliding variants. - if (length > 3 && has_duplicates) { - // Pivot the middle point right, until we find a character different from the first one. - for (; start[*second] == start[*first] && *second + 1 < *third; ++(*second)) {} - // Pivot the third (last) point left, until we find a different character. - for (; (start[*third] == start[*second] || start[*third] == start[*first]) && *third > (*second + 1); - --(*third)) {} - } - - // TODO: Investigate alternative strategies for long needles. - // On very long needles we have the luxury to choose! 
- // Often dealing with UTF8, we will likely benefit from shifting the first and second characters - // further to the right, to achieve not only uniqueness within the needle, but also avoid common - // rune prefixes of 2-, 3-, and 4-byte codes. - if (length > 8) { - // Pivot the first and second points right, until we find a character, that: - // > is different from others. - // > doesn't start with 0b'110x'xxxx - only 5 bits of relevant info. - // > doesn't start with 0b'1110'xxxx - only 4 bits of relevant info. - // > doesn't start with 0b'1111'0xxx - only 3 bits of relevant info. - // - // So we are practically searching for byte values that start with 0b0xxx'xxxx or 0b'10xx'xxxx. - // Meaning they fall in the range [0, 127] and [128, 191], in other words any unsigned int up to 191. - sz_u8_t const *start_u8 = (sz_u8_t const *)start; - sz_size_t vibrant_first = *first, vibrant_second = *second, vibrant_third = *third; - - // Let's begin with the seccond character, as the termination criteria there is more obvious - // and we may end up with more variants to check for the first candidate. - for (; (start_u8[vibrant_second] > 191 || start_u8[vibrant_second] == start_u8[vibrant_third]) && - (vibrant_second + 1 < vibrant_third); - ++vibrant_second) {} - - // Now check if we've indeed found a good candidate or should revert the `vibrant_second` to `second`. - if (start_u8[vibrant_second] < 191) { *second = vibrant_second; } - else { vibrant_second = *second; } - - // Now check the first character. - for (; (start_u8[vibrant_first] > 191 || start_u8[vibrant_first] == start_u8[vibrant_second] || - start_u8[vibrant_first] == start_u8[vibrant_third]) && - (vibrant_first + 1 < vibrant_second); - ++vibrant_first) {} - - // Now check if we've indeed found a good candidate or should revert the `vibrant_first` to `first`. - // We don't need to shift the third one when dealing with texts as the last byte of the text is - // also the last byte of a rune and contains the most information. - if (start_u8[vibrant_first] < 191) { *first = vibrant_first; } - } -} - -#pragma GCC visibility pop -#pragma endregion - -#pragma region Serial Implementation - -#if !SZ_AVOID_LIBC -#include // `fprintf` -#include // `malloc`, `EXIT_FAILURE` - -SZ_PUBLIC void *_sz_memory_allocate_default(sz_size_t length, void *handle) { - sz_unused(handle); - return malloc(length); -} -SZ_PUBLIC void _sz_memory_free_default(sz_ptr_t start, sz_size_t length, void *handle) { - sz_unused(handle && length); - free(start); -} - -#endif - -SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc) { -#if !SZ_AVOID_LIBC - alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_default; - alloc->free = (sz_memory_free_t)_sz_memory_free_default; -#else - alloc->allocate = (sz_memory_allocate_t)SZ_NULL; - alloc->free = (sz_memory_free_t)SZ_NULL; -#endif - alloc->handle = SZ_NULL; -} - -SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length) { - // The logic here is simple - put the buffer length in the first slots of the buffer. - // Later use it for bounds checking. - alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_fixed; - alloc->free = (sz_memory_free_t)_sz_memory_free_fixed; - alloc->handle = &buffer; - sz_copy((sz_ptr_t)buffer, (sz_cptr_t)&length, sizeof(sz_size_t)); -} - -/** - * @brief Byte-level equality comparison between two strings. - * If unaligned loads are allowed, uses a switch-table to avoid loops on short strings. 
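 * Illustrative example (not in the upstream docs): `sz_equal_serial("stringzilla", "stringzilla", 11)` yields
 * `sz_true_k`, while changing any single byte, e.g. comparing against "stringzillA", yields `sz_false_k`.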
- */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_cptr_t const a_end = a + length; -#if SZ_USE_MISALIGNED_LOADS - if (length >= SZ_SWAR_THRESHOLD) { - sz_u64_vec_t a_vec, b_vec; - for (; a + 8 <= a_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) return sz_false_k; - } - } -#endif - while (a != a_end && *a == *b) a++, b++; - return (sz_bool_t)(a_end == a); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { - for (sz_cptr_t const end = text + length; text != end; ++text) - if (sz_charset_contains(set, *text)) return text; - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Warray-bounds" - sz_cptr_t const end = text; - for (text += length; text != end;) - if (sz_charset_contains(set, *(text -= 1))) return text; - return SZ_NULL_CHAR; -#pragma GCC diagnostic pop -} - -/** - * One option to avoid branching is to use conditional moves and lookup the comparison result in a table: - * sz_ordering_t ordering_lookup[2] = {sz_greater_k, sz_less_k}; - * for (; a != min_end; ++a, ++b) - * if (*a != *b) return ordering_lookup[*a < *b]; - * That, however, introduces a data-dependency. - * A cleaner option is to perform two comparisons and a subtraction. - * One instruction more, but no data-dependency. - */ -#define _sz_order_scalars(a, b) ((sz_ordering_t)((a > b) - (a < b))) - -SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - sz_bool_t a_shorter = (sz_bool_t)(a_length < b_length); - sz_size_t min_length = a_shorter ? a_length : b_length; - sz_cptr_t min_end = a + min_length; -#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN - for (sz_u64_vec_t a_vec, b_vec; a + 8 <= min_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) - return _sz_order_scalars(sz_u64_bytes_reverse(a_vec.u64), sz_u64_bytes_reverse(b_vec.u64)); - } -#endif - for (; a != min_end; ++a, ++b) - if (*a != *b) return _sz_order_scalars(*a, *b); - - // If the strings are equal up to `min_end`, then the shorter string is smaller - return _sz_order_scalars(a_length, b_length); -} - -/** - * @brief Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each byte is set. - // For that take the bottom 7 bits of each byte, add one to them, - // and if this sets the top bit to one, then all the 7 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) & ((vec.u64 & 0x8080808080808080ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memchr(haystack, needle[0], haystack_length)`. - */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_end = h + h_length; - -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. 
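
    // Worked illustration (not from the original sources) of the SWAR trick used below via `_sz_u64_each_byte_equal`.
    // Searching for 'b' (0x62) in the 8-byte window "abcbdbeb":
    //      broadcast needle:   n_vec.u64 = 0x6262626262626262
    //      haystack window:    h_vec.u64 = 0x6265626462636261 (little-endian load of "abcbdbeb")
    // `~(h ^ n)` turns every matching byte into 0xFF; adding 1 to the low 7 bits of each byte overflows into the
    // top bit only when all 7 are set, and AND-ing with the original top bits filters out near-misses, so only
    // fully matching bytes keep their top bit. `sz_u64_ctz(match_vec.u64) / 8` then gives the offset of the first
    // match - here 1.
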
-#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. - sz_u64_vec_t h_vec, n_vec, match_vec; - match_vec.u64 = 0; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h + 8 <= h_end; h += 8) { - h_vec.u64 = *(sz_u64_t const *)h; - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h + sz_u64_ctz(match_vec.u64) / 8; - } -#endif - - // Handle the misaligned tail. - for (; h < h_end; ++h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief Find the last occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memrchr(haystack, needle[0], haystack_length)`. - */ -sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_start = h; - - // Reposition the `h` pointer to the end, as we will be walking backwards. - h = h + h_length - 1; - -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. -#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)(h + 1) & 7ull) && h >= h_start; --h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. - sz_u64_vec_t h_vec, n_vec, match_vec; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h >= h_start + 7; h -= 8) { - h_vec.u64 = *(sz_u64_t const *)(h - 7); - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h - sz_u64_clz(match_vec.u64) / 8; - } -#endif - - for (; h >= h_start; --h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 2Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 2byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_2byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 2byte is set. - // For that take the bottom 15 bits of each 2byte, add one to them, - // and if this sets the top bit to one, then all the 15 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFF7FFF7FFF7FFFull) + 0x0001000100010001ull) & ((vec.u64 & 0x8000800080008000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b two-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_2byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 2 bytes long. - sz_assert(h_length >= 2 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. 
- for (; ((sz_size_t)h & 7ull) && h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; -#endif - - sz_u64_vec_t h_even_vec, h_odd_vec, n_vec, matches_even_vec, matches_odd_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1]; - n_vec.u64 *= 0x0001000100010001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. - for (; h + 9 <= h_end; h += 8) { - h_even_vec.u64 = *(sz_u64_t *)h; - h_odd_vec.u64 = (h_even_vec.u64 >> 8) | ((sz_u64_t)h[8] << 56); - matches_even_vec = _sz_u64_each_2byte_equal(h_even_vec, n_vec); - matches_odd_vec = _sz_u64_each_2byte_equal(h_odd_vec, n_vec); - - matches_even_vec.u64 >>= 8; - if (matches_even_vec.u64 + matches_odd_vec.u64) { - sz_u64_t match_indicators = matches_even_vec.u64 | matches_odd_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 4Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 4byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_4byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 4byte is set. - // For that take the bottom 31 bits of each 4byte, add one to them, - // and if this sets the top bit to one, then all the 31 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFFFFFF7FFFFFFFull) + 0x0000000100000001ull) & ((vec.u64 & 0x8000000080000000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b four-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_4byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 4 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 4 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; -#endif - - sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, n_vec, matches0_vec, matches1_vec, matches2_vec, matches3_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2], n_vec.u8s[3] = n[3]; - n_vec.u64 *= 0x0000000100000001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using four 64-bit words. - // We load the subsequent four-byte word as well, taking its first bytes. 
Think of it as a glorified prefetch :)
-    sz_u64_t h_page_current, h_page_next;
-    for (; h + sizeof(sz_u64_t) + sizeof(sz_u32_t) <= h_end; h += sizeof(sz_u64_t)) {
-        h_page_current = *(sz_u64_t *)h;
-        h_page_next = *(sz_u32_t *)(h + 8);
-        h0_vec.u64 = (h_page_current);
-        h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56);
-        h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48);
-        h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40);
-        matches0_vec = _sz_u64_each_4byte_equal(h0_vec, n_vec);
-        matches1_vec = _sz_u64_each_4byte_equal(h1_vec, n_vec);
-        matches2_vec = _sz_u64_each_4byte_equal(h2_vec, n_vec);
-        matches3_vec = _sz_u64_each_4byte_equal(h3_vec, n_vec);
-
-        if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64) {
-            matches0_vec.u64 >>= 24;
-            matches1_vec.u64 >>= 16;
-            matches2_vec.u64 >>= 8;
-            sz_u64_t match_indicators = matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64;
-            return h + sz_u64_ctz(match_indicators) / 8;
-        }
-    }
-
-    for (; h + 4 <= h_end; ++h)
-        if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h;
-    return SZ_NULL_CHAR;
-}
-
-/**
- * @brief 3Byte-level equality comparison between two 64-bit integers.
- * @return 64-bit integer, where every top bit in each 3byte signifies a match.
- */
-SZ_INTERNAL sz_u64_vec_t _sz_u64_each_3byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) {
-    sz_u64_vec_t vec;
-    vec.u64 = ~(a.u64 ^ b.u64);
-    // The match is valid, if every bit within each 3byte is set.
-    // For that take the bottom 23 bits of each 3byte, add one to them,
-    // and if this sets the top bit to one, then all the 23 bits are ones as well.
-    vec.u64 = ((vec.u64 & 0xFFFF7FFFFF7FFFFFull) + 0x0000000001000001ull) & ((vec.u64 & 0x0000800000800000ull));
-    return vec;
-}
-
-/**
- * @brief Find the first occurrence of a @b three-character needle in an arbitrary length haystack.
- *        This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time.
- */
-SZ_INTERNAL sz_cptr_t _sz_find_3byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
-
-    // This is an internal method, and the haystack is guaranteed to be at least 3 bytes long.
-    sz_assert(h_length >= 3 && "The haystack is too short.");
-    sz_cptr_t const h_end = h + h_length;
-
-#if !SZ_USE_MISALIGNED_LOADS
-    // Process the misaligned head, to avoid UB on unaligned 64-bit loads.
-    for (; ((sz_size_t)h & 7ull) && h + 3 <= h_end; ++h)
-        if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h;
-#endif
-
-    // We fetch 10 bytes of the haystack at a time - an 8-byte word and the next 2 bytes.
-    sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, h4_vec;
-    sz_u64_vec_t matches0_vec, matches1_vec, matches2_vec, matches3_vec, matches4_vec;
-    sz_u64_vec_t n_vec;
-    n_vec.u64 = 0;
-    n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2];
-    n_vec.u64 *= 0x0000000001000001ull; // broadcast
-
-    // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using three 64-bit words.
-    // We load the subsequent two-byte word as well.
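    // Clarifying aside (not part of the original comments): each of the five shifted views `h0_vec`..`h4_vec`
    // checks two 3-byte lanes - at byte offsets 0-2 and 3-5 of that view - so the shifts by 0..4 bytes jointly
    // cover all 8 starting positions within the current step, and the re-alignment shifts applied before merging
    // the match masks make `sz_u64_ctz(match_indicators) / 8` the exact matching offset from `h`.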
- sz_u64_t h_page_current, h_page_next; - for (; h + sizeof(sz_u64_t) + sizeof(sz_u16_t) <= h_end; h += sizeof(sz_u64_t)) { - h_page_current = *(sz_u64_t *)h; - h_page_next = *(sz_u16_t *)(h + 8); - h0_vec.u64 = (h_page_current); - h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); - h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); - h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); - h4_vec.u64 = (h_page_current >> 32) | (h_page_next << 32); - matches0_vec = _sz_u64_each_3byte_equal(h0_vec, n_vec); - matches1_vec = _sz_u64_each_3byte_equal(h1_vec, n_vec); - matches2_vec = _sz_u64_each_3byte_equal(h2_vec, n_vec); - matches3_vec = _sz_u64_each_3byte_equal(h3_vec, n_vec); - matches4_vec = _sz_u64_each_3byte_equal(h4_vec, n_vec); - - if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64) { - matches0_vec.u64 >>= 16; - matches1_vec.u64 >>= 8; - matches3_vec.u64 <<= 8; - matches4_vec.u64 <<= 16; - sz_u64_t match_indicators = - matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 3 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_find_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - // Several popular string matching algorithms are using a bad-character shift table. - // Boyer Moore: https://www-igm.univ-mlv.fr/~lecroq/string/node14.html - // Quick Search: https://www-igm.univ-mlv.fr/~lecroq/string/node19.html - // Smith: https://www-igm.univ-mlv.fr/~lecroq/string/node21.html - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) bad_shift_table.jumps[n[i]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the last `n_length - 1` bytes. 
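    // Worked illustration (not from the original sources): for the needle "abcab" the table above starts with every
    // byte mapped to 5 and then gets jumps['a'] = 1, jumps['b'] = 3, jumps['c'] = 2 - the distance from each
    // character's last occurrence (excluding the final position) to the end of the needle. The loop below always
    // advances by the jump of the haystack byte aligned with the needle's last character, so bytes absent from the
    // needle skip a full pattern length.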
- for (sz_size_t i = 0; i <= h_length - n_length;) { - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - i += bad_shift_table.jumps[h[i + n_length - 1]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for @b reverse-order exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) - bad_shift_table.jumps[n[n_length - i - 1]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the first `n_length - 1` bytes. - for (sz_size_t j = 0; j <= h_length - n_length;) { - sz_size_t i = h_length - n_length - j; - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - j += bad_shift_table.jumps[h[i]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Exact substring search helper function, that finds the first occurrence of a prefix of the needle - * using a given search function, and then verifies the remaining part of the needle. - */ -SZ_INTERNAL sz_cptr_t _sz_find_with_prefix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_prefix, sz_size_t prefix_length) { - - sz_size_t suffix_length = n_length - prefix_length; - while (1) { - sz_cptr_t found = find_prefix(h, h_length, n, prefix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = h_length - (found - h); - if (remaining < n_length) return SZ_NULL_CHAR; - if (sz_equal(found + prefix_length, n + prefix_length, suffix_length)) return found; - - // Adjust the position. 
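        // Note (illustrative): a failed verification only rules out this exact position, e.g. a matching 4-byte
        // prefix whose remaining suffix differs, so the search can safely resume one byte past the candidate.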
- h = found + 1; - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -/** - * @brief Exact reverse-order substring search helper function, that finds the last occurrence of a suffix of the - * needle using a given search function, and then verifies the remaining part of the needle. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_with_suffix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_suffix, sz_size_t suffix_length) { - - sz_size_t prefix_length = n_length - suffix_length; - while (1) { - sz_cptr_t found = find_suffix(h, h_length, n + prefix_length, suffix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = found - h; - if (remaining < prefix_length) return SZ_NULL_CHAR; - if (sz_equal(found - prefix_length, n, prefix_length)) return found - prefix_length; - - // Adjust the position. - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -SZ_INTERNAL sz_cptr_t _sz_find_over_4bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, (sz_find_t)_sz_find_4byte_serial, 4); -} - -SZ_INTERNAL sz_cptr_t _sz_find_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, - sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, _sz_find_horspool_upto_256bytes_serial, 256); -} - -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, - sz_size_t n_length) { - return _sz_rfind_with_suffix(h, h_length, n, n_length, _sz_rfind_horspool_upto_256bytes_serial, 256); -} - -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - -#if SZ_DETECT_BIG_ENDIAN - sz_find_t backends[] = { - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[(n_length > 1) + (n_length > 256)](h, h_length, n, n_length); -#else - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_2byte_serial, - (sz_find_t)_sz_find_3byte_serial, - (sz_find_t)_sz_find_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - (sz_find_t)_sz_find_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. - (n_length > 1) + (n_length > 2) + (n_length > 3) + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 4) + - // For longer needles - use skip tables. - (n_length > 8) + (n_length > 256)](h, h_length, n, n_length); -#endif -} - -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. 
- (sz_find_t)sz_rfind_byte_serial, - // TODO: implement reverse-order SWAR for 2/3/4 byte variants. - // TODO: (sz_find_t)_sz_rfind_2byte_serial, - // TODO: (sz_find_t)_sz_rfind_3byte_serial, - // TODO: (sz_find_t)_sz_rfind_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - // (sz_find_t)_sz_rfind_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_rfind_horspool_upto_256bytes_serial, - (sz_find_t)_sz_rfind_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. - 0 + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 1) + - // For longer needles - use skip tables. - (n_length > 256)](h, h_length, n, n_length); -} - -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // TODO: Generalize to remove the following asserts! - sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix."); - sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet."); - sz_unused(longer_length && bound); - - // We are going to store 3 diagonals of the matrix. - // The length of the longest (main) diagonal would be `n = (shorter_length + 1)`. - sz_size_t n = shorter_length + 1; - sz_size_t buffer_length = sizeof(sz_size_t) * n * 3; - sz_size_t *distances = (sz_size_t *)alloc->allocate(buffer_length, alloc->handle); - if (!distances) return SZ_SIZE_MAX; - - sz_size_t *previous_distances = distances; - sz_size_t *current_distances = previous_distances + n; - sz_size_t *next_distances = previous_distances + n * 2; - - // Initialize the first two diagonals: - previous_distances[0] = 0; - current_distances[0] = current_distances[1] = 1; - - // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_diagonal_index = 2; - for (; next_diagonal_index != n; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = next_diagonal_index + 1; - for (sz_size_t i = 0; i + 2 < next_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[next_diagonal_index - i - 2] != longer[i]; - sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; - sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; - next_distances[i + 1] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); - } - // Don't forget to populate the first row and the first column of the Levenshtein matrix. - next_distances[0] = next_distances[next_diagonal_length - 1] = next_diagonal_index; - // Perform a circular rotation of those buffers, to reuse the memory. - sz_size_t *temporary = previous_distances; - previous_distances = current_distances; - current_distances = next_distances; - next_distances = temporary; - } - - // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a - // larger diagonal. From now onwards, we will be shrinking. 
Instead of adding value equal to the skewed diagonal - // index on either side, we will be cropping those values out. - sz_size_t diagonals_count = n + n - 1; - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; - for (sz_size_t i = 0; i != next_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[shorter_length - 1 - i] != longer[next_diagonal_index - n + i]; - sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; - sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; - next_distances[i] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); - } - // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift, - // dropping the first element in the current array. - sz_size_t *temporary = previous_distances; - previous_distances = current_distances + 1; - current_distances = next_distances; - next_distances = temporary; - } - - // Cache scalar before `free` call. - sz_size_t result = current_distances[0]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -/** - * @brief Describes the length of a UTF8 character / codepoint / rune in bytes. - */ -typedef enum { - sz_utf8_invalid_k = 0, //!< Invalid UTF8 character. - sz_utf8_rune_1byte_k = 1, //!< 1-byte UTF8 character. - sz_utf8_rune_2bytes_k = 2, //!< 2-byte UTF8 character. - sz_utf8_rune_3bytes_k = 3, //!< 3-byte UTF8 character. - sz_utf8_rune_4bytes_k = 4, //!< 4-byte UTF8 character. -} sz_rune_length_t; - -typedef sz_u32_t sz_rune_t; - -/** - * @brief Extracts just one UTF8 codepoint from a UTF8 string into a 32-bit unsigned integer. - */ -SZ_INTERNAL void _sz_extract_utf8_rune(sz_cptr_t utf8, sz_rune_t *code, sz_rune_length_t *code_length) { - sz_u8_t const *current = (sz_u8_t const *)utf8; - sz_u8_t leading_byte = *current++; - sz_rune_t ch; - sz_rune_length_t ch_length; - - // TODO: This can be made entirely branchless using 32-bit SWAR. - if (leading_byte < 0x80) { - // Single-byte rune (0xxxxxxx) - ch = leading_byte; - ch_length = sz_utf8_rune_1byte_k; - } - else if ((leading_byte & 0xE0) == 0xC0) { - // Two-byte rune (110xxxxx 10xxxxxx) - ch = (leading_byte & 0x1F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_2bytes_k; - } - else if ((leading_byte & 0xF0) == 0xE0) { - // Three-byte rune (1110xxxx 10xxxxxx 10xxxxxx) - ch = (leading_byte & 0x0F) << 12; - ch |= (*current++ & 0x3F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_3bytes_k; - } - else if ((leading_byte & 0xF8) == 0xF0) { - // Four-byte rune (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) - ch = (leading_byte & 0x07) << 18; - ch |= (*current++ & 0x3F) << 12; - ch |= (*current++ & 0x3F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_4bytes_k; - } - else { - // Invalid UTF8 rune. - ch = 0; - ch_length = sz_utf8_invalid_k; - } - *code = ch; - *code_length = ch_length; -} - -/** - * @brief Exports a UTF8 string into a UTF32 buffer. - * ! The result is undefined id the UTF8 string is corrupted. - * @return The length in the number of codepoints. 
- */ -SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_length, sz_rune_t *utf32) { - sz_cptr_t const end = utf8 + utf8_length; - sz_size_t count = 0; - sz_rune_length_t rune_length; - for (; utf8 != end; utf8 += rune_length, utf32++, count++) _sz_extract_utf8_rune(utf8, utf32, &rune_length); - return count; -} - -/** - * @brief Compute the Levenshtein distance between two strings using the Wagner-Fisher algorithm. - * Stores only 2 rows of the Levenshtein matrix, but uses 64-bit integers for the distance values, - * and upcasts UTF8 variable-length codepoints to 64-bit integers for faster addressing. - * - * ! In the worst case for 2 strings of length 100, that contain just one 16-bit codepoint this will result in extra: - * + 2 rows * 100 slots * 8 bytes/slot = 1600 bytes of memory for the two rows of the Levenshtein matrix rows. - * + 100 codepoints * 2 strings * 4 bytes/codepoint = 800 bytes of memory for the UTF8 buffer. - * = 2400 bytes of memory or @b 12x memory amplification! - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_bool_t can_be_unicode, sz_memory_allocator_t *alloc) { - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // A good idea may be to dispatch different kernels for different string lengths. - // Like using `uint8_t` counters for strings under 255 characters long. - // Good in theory, this results in frequent upcasts and downcasts in serial code. - // On strings over 20 bytes, using `uint8` over `uint64` on 64-bit x86 CPU doubles the execution time. - // So one must be very cautious with such optimizations. - typedef sz_size_t _distance_t; - - // Compute the number of columns in our Levenshtein matrix. - sz_size_t const n = shorter_length + 1; - - // If a buffering memory-allocator is provided, this operation is practically free, - // and cheaper than allocating even 512 bytes (for small distance matrices) on stack. - sz_size_t buffer_length = sizeof(_distance_t) * (n * 2); - - // If the strings contain Unicode characters, let's estimate the max character width, - // and use it to allocate a larger buffer to decode UTF8. - if ((can_be_unicode == sz_true_k) && - (sz_isascii(longer, longer_length) == sz_false_k || sz_isascii(shorter, shorter_length) == sz_false_k)) { - buffer_length += (shorter_length + longer_length) * sizeof(sz_rune_t); - } - else { can_be_unicode = sz_false_k; } - - // If the allocation fails, return the maximum distance. - sz_ptr_t const buffer = (sz_ptr_t)alloc->allocate(buffer_length, alloc->handle); - if (!buffer) return SZ_SIZE_MAX; - - // Let's export the UTF8 sequence into the newly allocated buffer at the end. - if (can_be_unicode == sz_true_k) { - sz_rune_t *const longer_utf32 = (sz_rune_t *)(buffer + sizeof(_distance_t) * (n * 2)); - sz_rune_t *const shorter_utf32 = longer_utf32 + longer_length; - // Export the UTF8 sequences into the newly allocated buffer. 
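        // Clarifying aside (not part of the original comments): at this point `buffer` holds two `n`-long rows of
        // distance counters, immediately followed by the UTF-32 scratch space - the longer string's codepoints
        // first, then the shorter's. After the exports below, `longer_length` and `shorter_length` are measured in
        // codepoints rather than bytes.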
- longer_length = _sz_export_utf8_to_utf32(longer, longer_length, longer_utf32); - shorter_length = _sz_export_utf8_to_utf32(shorter, shorter_length, shorter_utf32); - longer = (sz_cptr_t)longer_utf32; - shorter = (sz_cptr_t)shorter_utf32; - } - - // Let's parameterize the core logic for different character types and distance types. -#define _wagner_fisher_unbounded(_distance_t, _char_t) \ - /* Now let's cast our pointer to avoid it in subsequent sections. */ \ - _char_t const *const longer_chars = (_char_t const *)longer; \ - _char_t const *const shorter_chars = (_char_t const *)shorter; \ - _distance_t *previous_distances = (_distance_t *)buffer; \ - _distance_t *current_distances = previous_distances + n; \ - /* Initialize the first row of the Levenshtein matrix with `iota`-style arithmetic progression. */ \ - for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ - /* The main loop of the algorithm with quadratic complexity. */ \ - for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ - _char_t const longer_char = longer_chars[idx_longer]; \ - /* Using pure pointer arithmetic is faster than iterating with an index. */ \ - _char_t const *shorter_ptr = shorter_chars; \ - _distance_t const *previous_ptr = previous_distances; \ - _distance_t *current_ptr = current_distances; \ - _distance_t *const current_end = current_ptr + shorter_length; \ - current_ptr[0] = idx_longer + 1; \ - for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ - _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ - /* We can avoid `+1` for costs here, shifting it to post-minimum computation, */ \ - /* saving one increment operation. */ \ - _distance_t cost_deletion = previous_ptr[1]; \ - _distance_t cost_insertion = current_ptr[0]; \ - /* ? It might be a good idea to enforce branchless execution here. */ \ - /* ? The caveat being that the benchmarks on longer sequences backfire and more research is needed. */ \ - current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ - } \ - /* Swap `previous_distances` and `current_distances` pointers. */ \ - _distance_t *temporary = previous_distances; \ - previous_distances = current_distances; \ - current_distances = temporary; \ - } \ - /* Cache scalar before `free` call. */ \ - sz_size_t result = previous_distances[shorter_length]; \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return result; - - // Let's define a separate variant for bounded distance computation. - // Practically the same as unbounded, but also collecting the running minimum within each row for early exit. 
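    // Illustrative note: the bound relies on the fact that values never decrease along the matrix diagonals, so the
    // final distance can't drop below the smallest entry of any completed row - once a whole row reaches the bound,
    // the exact answer is no longer needed. For example, the edit distance between "kitten" and "sitting" is 3, so
    // with `bound = 2` the caller only learns that the distance is at least 2.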
-#define _wagner_fisher_bounded(_distance_t, _char_t) \ - _char_t const *const longer_chars = (_char_t const *)longer; \ - _char_t const *const shorter_chars = (_char_t const *)shorter; \ - _distance_t *previous_distances = (_distance_t *)buffer; \ - _distance_t *current_distances = previous_distances + n; \ - for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ - for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ - _char_t const longer_char = longer_chars[idx_longer]; \ - _char_t const *shorter_ptr = shorter_chars; \ - _distance_t const *previous_ptr = previous_distances; \ - _distance_t *current_ptr = current_distances; \ - _distance_t *const current_end = current_ptr + shorter_length; \ - current_ptr[0] = idx_longer + 1; \ - /* Initialize min_distance with a value greater than bound */ \ - _distance_t min_distance = bound - 1; \ - for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ - _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ - _distance_t cost_deletion = previous_ptr[1]; \ - _distance_t cost_insertion = current_ptr[0]; \ - current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ - /* Keep track of the minimum distance seen so far in this row */ \ - min_distance = sz_min_of_two(current_ptr[1], min_distance); \ - } \ - /* If the minimum distance in this row exceeded the bound, return early */ \ - if (min_distance >= bound) { \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return bound; \ - } \ - _distance_t *temporary = previous_distances; \ - previous_distances = current_distances; \ - current_distances = temporary; \ - } \ - sz_size_t result = previous_distances[shorter_length]; \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return sz_min_of_two(result, bound); - - // Dispatch the actual computation. - if (!bound) { - if (can_be_unicode == sz_true_k) { _wagner_fisher_unbounded(sz_size_t, sz_rune_t); } - else { _wagner_fisher_unbounded(sz_size_t, sz_u8_t); } - } - else { - if (can_be_unicode == sz_true_k) { _wagner_fisher_bounded(sz_size_t, sz_rune_t); } - else { _wagner_fisher_bounded(sz_size_t, sz_u8_t); } - } -} - -SZ_PUBLIC sz_size_t sz_edit_distance_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Skip the matching prefixes and suffixes, they won't affect the distance. - for (sz_cptr_t a_end = longer + longer_length, b_end = shorter + shorter_length; - longer != a_end && shorter != b_end && *longer == *shorter; - ++longer, ++shorter, --longer_length, --shorter_length); - for (; longer_length && shorter_length && longer[longer_length - 1] == shorter[shorter_length - 1]; - --longer_length, --shorter_length); - - // Bounded computations may exit early. - int const is_bounded = bound < longer_length; - if (is_bounded) { - // If one of the strings is empty - the edit distance is equal to the length of the other one. 
- if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); - // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; - } - - if (shorter_length == 0) return longer_length; // If no mismatches were found - the distance is zero. - if (shorter_length == longer_length && !is_bounded) - return _sz_edit_distance_skewed_diagonals_serial(longer, longer_length, shorter, shorter_length, bound, alloc); - return _sz_edit_distance_wagner_fisher_serial(longer, longer_length, shorter, shorter_length, bound, sz_false_k, - alloc); -} - -SZ_PUBLIC sz_ssize_t sz_alignment_score_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc) { - - // If one of the strings is empty - the edit distance is equal to the length of the other one - if (longer_length == 0) return (sz_ssize_t)shorter_length * gap; - if (shorter_length == 0) return (sz_ssize_t)longer_length * gap; - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - sz_size_t n = shorter_length + 1; - sz_size_t buffer_length = sizeof(sz_ssize_t) * n * 2; - sz_ssize_t *distances = (sz_ssize_t *)alloc->allocate(buffer_length, alloc->handle); - sz_ssize_t *previous_distances = distances; - sz_ssize_t *current_distances = previous_distances + n; - - for (sz_size_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) - previous_distances[idx_shorter] = (sz_ssize_t)idx_shorter * gap; - - sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter; - sz_u8_t const *longer_unsigned = (sz_u8_t const *)longer; - for (sz_size_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { - current_distances[0] = ((sz_ssize_t)idx_longer + 1) * gap; - - // Initialize min_distance with a value greater than bound - sz_error_cost_t const *a_subs = subs + longer_unsigned[idx_longer] * 256ul; - for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) { - sz_ssize_t cost_deletion = previous_distances[idx_shorter + 1] + gap; - sz_ssize_t cost_insertion = current_distances[idx_shorter] + gap; - sz_ssize_t cost_substitution = previous_distances[idx_shorter] + a_subs[shorter_unsigned[idx_shorter]]; - current_distances[idx_shorter + 1] = sz_max_of_three(cost_deletion, cost_insertion, cost_substitution); - } - - // Swap previous_distances and current_distances pointers - sz_pointer_swap((void **)&previous_distances, (void **)¤t_distances); - } - - // Cache scalar before `free` call. 
- sz_ssize_t result = previous_distances[shorter_length]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - - sz_size_t const min_length = sz_min_of_two(a_length, b_length); - sz_size_t const max_length = sz_max_of_two(a_length, b_length); - sz_cptr_t const a_end = a + min_length; - bound = bound == 0 ? max_length : bound; - - // Walk through both strings using SWAR and counting the number of differing characters. - sz_size_t distance = max_length - min_length; -#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN - if (min_length >= SZ_SWAR_THRESHOLD) { - sz_u64_vec_t a_vec, b_vec, match_vec; - for (; a + 8 <= a_end && distance < bound; a += 8, b += 8) { - a_vec.u64 = sz_u64_load(a).u64; - b_vec.u64 = sz_u64_load(b).u64; - match_vec = _sz_u64_each_byte_equal(a_vec, b_vec); - distance += sz_u64_popcount((~match_vec.u64) & 0x8080808080808080ull); - } - } -#endif - - for (; a != a_end && distance < bound; ++a, ++b) { distance += (*a != *b); } - return sz_min_of_two(distance, bound); -} - -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - - sz_cptr_t const a_end = a + a_length; - sz_cptr_t const b_end = b + b_length; - sz_size_t distance = 0; - - sz_rune_t a_rune, b_rune; - sz_rune_length_t a_rune_length, b_rune_length; - - if (bound) { - for (; a < a_end && b < b_end && distance < bound; a += a_rune_length, b += b_rune_length) { - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - distance += (a_rune != b_rune); - } - // If one string has more runes, we need to go through the tail. - if (distance < bound) { - for (; a < a_end && distance < bound; a += a_rune_length, ++distance) - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - - for (; b < b_end && distance < bound; b += b_rune_length, ++distance) - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - } - } - else { - for (; a < a_end && b < b_end; a += a_rune_length, b += b_rune_length) { - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - distance += (a_rune != b_rune); - } - // If one string has more runes, we need to go through the tail. - for (; a < a_end; a += a_rune_length, ++distance) _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - for (; b < b_end; b += b_rune_length, ++distance) _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - } - return distance; -} - -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length) { - sz_u64_t checksum = 0; - sz_u8_t const *text_u8 = (sz_u8_t const *)text; - sz_u8_t const *text_end = text_u8 + length; - for (; text_u8 != text_end; ++text_u8) checksum += *text_u8; - return checksum; -} - -/** - * @brief Largest prime number that fits into 31 bits. - * @see https://mersenneforum.org/showthread.php?t=3471 - */ -#define SZ_U32_MAX_PRIME (2147483647u) - -/** - * @brief Largest prime number that fits into 64 bits. - * @see https://mersenneforum.org/showthread.php?t=3471 - * - * 2^64 = 18,446,744,073,709,551,616 - * this = 18,446,744,073,709,551,557 - * diff = 59 - */ -#define SZ_U64_MAX_PRIME (18446744073709551557ull) - -/* - * One hardware-accelerated way of mixing hashes can be CRC, but it's only implemented for 32-bit values. 
- * Using a Boost-like mixer works very poorly in such case: - * - * hash_first ^ (hash_second + 0x517cc1b727220a95 + (hash_first << 6) + (hash_first >> 2)); - * - * Let's stick to the Fibonacci hash trick using the golden ratio. - * https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ - */ -#define _sz_hash_mix(first, second) ((first * 11400714819323198485ull) ^ (second * 11400714819323198485ull)) -#define _sz_shift_low(x) (x) -#define _sz_shift_high(x) ((x + 77ull) & 0xFFull) -#define _sz_prime_mod(x) (x % SZ_U64_MAX_PRIME) - -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length) { - - sz_u64_t hash_low = 0; - sz_u64_t hash_high = 0; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - switch (length) { - case 0: return 0; - - // Texts under 7 bytes long are definitely below the largest prime. - case 1: - hash_low = _sz_shift_low(text[0]); - hash_high = _sz_shift_high(text[0]); - break; - case 2: - hash_low = _sz_shift_low(text[0]) * 31ull + _sz_shift_low(text[1]); - hash_high = _sz_shift_high(text[0]) * 257ull + _sz_shift_high(text[1]); - break; - case 3: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull + // - _sz_shift_low(text[2]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull + // - _sz_shift_high(text[2]); - break; - case 4: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull + // - _sz_shift_low(text[3]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull + // - _sz_shift_high(text[3]); - break; - case 5: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull + // - _sz_shift_low(text[4]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull + // - _sz_shift_high(text[4]); - break; - case 6: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull + // - _sz_shift_low(text[5]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull + // - _sz_shift_high(text[5]); - break; - case 7: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull * 31ull + // - _sz_shift_low(text[5]) * 31ull + // - _sz_shift_low(text[6]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull 
* 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull * 257ull + // - _sz_shift_high(text[5]) * 257ull + // - _sz_shift_high(text[6]); - break; - default: - // Unroll the first seven cycles: - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - hash_low = hash_low * 31ull + _sz_shift_low(text[1]); - hash_high = hash_high * 257ull + _sz_shift_high(text[1]); - hash_low = hash_low * 31ull + _sz_shift_low(text[2]); - hash_high = hash_high * 257ull + _sz_shift_high(text[2]); - hash_low = hash_low * 31ull + _sz_shift_low(text[3]); - hash_high = hash_high * 257ull + _sz_shift_high(text[3]); - hash_low = hash_low * 31ull + _sz_shift_low(text[4]); - hash_high = hash_high * 257ull + _sz_shift_high(text[4]); - hash_low = hash_low * 31ull + _sz_shift_low(text[5]); - hash_high = hash_high * 257ull + _sz_shift_high(text[5]); - hash_low = hash_low * 31ull + _sz_shift_low(text[6]); - hash_high = hash_high * 257ull + _sz_shift_high(text[6]); - text += 7; - - // Iterate throw the rest with the modulus: - for (; text != text_end; ++text) { - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - } - break; - } - - return _sz_hash_mix(hash_low, hash_high); -} - -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Compute the initial hash value for the first window. - sz_u64_t hash_low = 0, hash_high = 0, hash_mix; - for (sz_u8_t const *first_end = text + window_length; text < first_end; ++text) - hash_low = (hash_low * 31ull + _sz_shift_low(*text)) % SZ_U64_MAX_PRIME, - hash_high = (hash_high * 257ull + _sz_shift_high(*text)) % SZ_U64_MAX_PRIME; - - // In most cases the fingerprint length will be a power of two. - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - - // Compute the hash value for every window, exporting into the fingerprint, - // using the expensive modulo operation. - sz_size_t cycles = 1; - sz_size_t const step_mask = step - 1; - for (; text < text_end; ++text, ++cycles) { - // Discard one character: - hash_low -= _sz_shift_low(*(text - window_length)) * prime_power_low; - hash_high -= _sz_shift_high(*(text - window_length)) * prime_power_high; - // And add a new one: - hash_low = 31ull * hash_low + _sz_shift_low(*text); - hash_high = 257ull * hash_high + _sz_shift_high(*text); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - // Mix only if we've skipped enough hashes. 
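        // Note: `step_mask = step - 1` emulates `cycles % step == 0` only under the assumption that
        // `step` is a power of two; for other step values the callback cadence would differ.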
- if ((cycles & step_mask) == 0) { - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - } - } -} - -#undef _sz_shift_low -#undef _sz_shift_high -#undef _sz_hash_mix -#undef _sz_prime_mod - -/** - * @brief Uses a small lookup-table to convert a lowercase character to uppercase. - */ -SZ_INTERNAL sz_u8_t sz_u8_tolower(sz_u8_t c) { - static sz_u8_t const lowered[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return lowered[c]; -} - -/** - * @brief Uses a small lookup-table to convert an uppercase character to lowercase. - */ -SZ_INTERNAL sz_u8_t sz_u8_toupper(sz_u8_t c) { - static sz_u8_t const upped[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return upped[c]; -} - -/** - * @brief Uses two small lookup tables (768 bytes total) to accelerate division by a small - * unsigned integer. Performs two lookups, one multiplication, two shifts, and two accumulations. - * - * @param divisor Integral value @b larger than one. - * @param number Integral value to divide. 
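 * Worked example (for illustration): dividing 200 by 3 uses multipliers[3] = 21846 and shifts[3] = 1,
 * so q = (21846 * 200) >> 16 = 66, t = ((200 - 66) >> 1) + 66 = 133, and 133 >> 1 = 66 = 200 / 3.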
- */ -SZ_INTERNAL sz_u8_t sz_u8_divide(sz_u8_t number, sz_u8_t divisor) { - sz_assert(divisor > 1); - static sz_u16_t const multipliers[256] = { - 0, 0, 0, 21846, 0, 39322, 21846, 9363, 0, 50973, 39322, 29790, 21846, 15124, 9363, 4370, - 0, 57826, 50973, 44841, 39322, 34329, 29790, 25645, 21846, 18351, 15124, 12137, 9363, 6780, 4370, 2115, - 0, 61565, 57826, 54302, 50973, 47824, 44841, 42011, 39322, 36765, 34329, 32006, 29790, 27671, 25645, 23705, - 21846, 20063, 18351, 16706, 15124, 13602, 12137, 10725, 9363, 8049, 6780, 5554, 4370, 3224, 2115, 1041, - 0, 63520, 61565, 59668, 57826, 56039, 54302, 52614, 50973, 49377, 47824, 46313, 44841, 43407, 42011, 40649, - 39322, 38028, 36765, 35532, 34329, 33154, 32006, 30885, 29790, 28719, 27671, 26647, 25645, 24665, 23705, 22766, - 21846, 20945, 20063, 19198, 18351, 17520, 16706, 15907, 15124, 14356, 13602, 12863, 12137, 11424, 10725, 10038, - 9363, 8700, 8049, 7409, 6780, 6162, 5554, 4957, 4370, 3792, 3224, 2665, 2115, 1573, 1041, 517, - 0, 64520, 63520, 62535, 61565, 60609, 59668, 58740, 57826, 56926, 56039, 55164, 54302, 53452, 52614, 51788, - 50973, 50169, 49377, 48595, 47824, 47063, 46313, 45572, 44841, 44120, 43407, 42705, 42011, 41326, 40649, 39982, - 39322, 38671, 38028, 37392, 36765, 36145, 35532, 34927, 34329, 33738, 33154, 32577, 32006, 31443, 30885, 30334, - 29790, 29251, 28719, 28192, 27671, 27156, 26647, 26143, 25645, 25152, 24665, 24182, 23705, 23233, 22766, 22303, - 21846, 21393, 20945, 20502, 20063, 19628, 19198, 18772, 18351, 17933, 17520, 17111, 16706, 16305, 15907, 15514, - 15124, 14738, 14356, 13977, 13602, 13231, 12863, 12498, 12137, 11779, 11424, 11073, 10725, 10380, 10038, 9699, - 9363, 9030, 8700, 8373, 8049, 7727, 7409, 7093, 6780, 6470, 6162, 5857, 5554, 5254, 4957, 4662, - 4370, 4080, 3792, 3507, 3224, 2943, 2665, 2388, 2115, 1843, 1573, 1306, 1041, 778, 517, 258, - }; - // This table can be avoided using a single addition and counting trailing zeros. 
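    // For a divisor `d`, the shift below appears to equal ceil(log2(d)) - 1 (e.g. 1 for 3..4, 2 for 5..8).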
- static sz_u8_t const shifts[256] = { - 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // - 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // - 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // - 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - }; - sz_u32_t multiplier = multipliers[divisor]; - sz_u8_t shift = shifts[divisor]; - - sz_u16_t q = (sz_u16_t)((multiplier * number) >> 16); - sz_u16_t t = ((number - q) >> 1) + q; - return (sz_u8_t)(t >> shift); -} - -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result) { - sz_u8_t const *unsigned_lut = (sz_u8_t const *)lut; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = unsigned_lut[*unsigned_text]; -} - -SZ_PUBLIC void sz_tolower_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_tolower(*unsigned_text); -} - -SZ_PUBLIC void sz_toupper_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_toupper(*unsigned_text); -} - -SZ_PUBLIC void sz_toascii_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = *unsigned_text & 0x7F; -} - -/** - * @brief Check if there is a byte in this buffer, that exceeds 127 and can't be an ASCII character. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - */ -SZ_PUBLIC sz_bool_t sz_isascii_serial(sz_cptr_t text, sz_size_t length) { - - if (!length) return sz_true_k; - sz_u8_t const *h = (sz_u8_t const *)text; - sz_u8_t const *const h_end = h + length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h & 0x80ull) return sz_false_k; -#endif - - // Validate eight bytes at once using SWAR. - sz_u64_vec_t text_vec; - for (; h + 8 <= h_end; h += 8) { - text_vec.u64 = *(sz_u64_t const *)h; - if (text_vec.u64 & 0x8080808080808080ull) return sz_false_k; - } - - // Handle the misaligned tail. 
- for (; h < h_end; ++h) - if (*h & 0x80ull) return sz_false_k; - return sz_true_k; -} - -SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { - - sz_assert(alphabet_size > 0 && alphabet_size <= 256 && "Inadequate alphabet size"); - - if (alphabet_size == 1) sz_fill(result, result_length, *alphabet); - - else { - sz_assert(generator && "Expects a valid random generator"); - sz_u8_t divisor = (sz_u8_t)alphabet_size; - for (sz_cptr_t end = result + result_length; result != end; ++result) { - sz_u8_t random = generator(generator_user_data) & 0xFF; - sz_u8_t quotient = sz_u8_divide(random, divisor); - *result = alphabet[random - quotient * divisor]; - } - } -} - -#pragma endregion - -/* - * Serial implementation of string class operations. - */ -#pragma region Serial Implementation for the String Class - -SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string) { - // It doesn't matter if it's on stack or heap, the pointer location is the same. - return (sz_bool_t)((sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]); -} - -SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length) { - sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]; - sz_size_t is_big_mask = is_small - 1ull; - *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same. - // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. - *length = string->external.length & (0x00000000000000FFull | is_big_mask); -} - -SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, - sz_bool_t *is_external) { - sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]; - sz_size_t is_big_mask = is_small - 1ull; - *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same. - // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. - *length = string->external.length & (0x00000000000000FFull | is_big_mask); - // In case the string is small, the `is_small - 1ull` will become 0xFFFFFFFFFFFFFFFFull. - *space = sz_u64_blend(SZ_STRING_INTERNAL_SPACE, string->external.space, is_big_mask); - *is_external = (sz_bool_t)!is_small; -} - -SZ_PUBLIC sz_bool_t sz_string_equal(sz_string_t const *a, sz_string_t const *b) { - // Tempting to say that the external.length is bitwise the same even if it includes - // some bytes of the on-stack payload, but we don't at this writing maintain that invariant. - // (An on-stack string includes noise bytes in the high-order bits of external.length. So do this - // the hard/correct way. - -#if SZ_USE_MISALIGNED_LOADS - // Dealing with StringZilla strings, we know that the `start` pointer always points - // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. - -#endif - // Alternatively, fall back to byte-by-byte comparison. 
- sz_ptr_t a_start, b_start; - sz_size_t a_length, b_length; - sz_string_range(a, &a_start, &a_length); - sz_string_range(b, &b_start, &b_length); - return (sz_bool_t)(a_length == b_length && sz_equal(a_start, b_start, b_length)); -} - -SZ_PUBLIC sz_ordering_t sz_string_order(sz_string_t const *a, sz_string_t const *b) { -#if SZ_USE_MISALIGNED_LOADS - // Dealing with StringZilla strings, we know that the `start` pointer always points - // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. - -#endif - // Alternatively, fall back to byte-by-byte comparison. - sz_ptr_t a_start, b_start; - sz_size_t a_length, b_length; - sz_string_range(a, &a_start, &a_length); - sz_string_range(b, &b_start, &b_length); - return sz_order(a_start, a_length, b_start, b_length); -} - -SZ_PUBLIC void sz_string_init(sz_string_t *string) { - sz_assert(string && "String can't be SZ_NULL."); - - // Only 8 + 1 + 1 need to be initialized. - string->internal.start = &string->internal.chars[0]; - // But for safety let's initialize the entire structure to zeros. - // string->internal.chars[0] = 0; - // string->internal.length = 0; - string->words[1] = 0; - string->words[2] = 0; - string->words[3] = 0; -} - -SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator) { - sz_size_t space_needed = length + 1; // space for trailing \0 - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); - // Initialize the string to zeros for safety. - string->words[1] = 0; - string->words[2] = 0; - string->words[3] = 0; - // If we are lucky, no memory allocations will be needed. - if (space_needed <= SZ_STRING_INTERNAL_SPACE) { - string->internal.start = &string->internal.chars[0]; - string->internal.length = (sz_u8_t)length; - } - else { - // If we are not lucky, we need to allocate memory. - string->external.start = (sz_ptr_t)allocator->allocate(space_needed, allocator->handle); - if (!string->external.start) return SZ_NULL_CHAR; - string->external.length = length; - string->external.space = space_needed; - } - sz_assert(&string->internal.start == &string->external.start && "Alignment confusion"); - string->external.start[length] = 0; - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - - sz_size_t new_space = new_capacity + 1; - if (new_space <= SZ_STRING_INTERNAL_SPACE) return string->external.start; - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - sz_assert(new_space > string_space && "New space must be larger than current."); - - sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); - if (!new_start) return SZ_NULL_CHAR; - - sz_copy(new_start, string_start, string_length); - string->external.start = new_start; - string->external.space = new_space; - string->external.padding = 0; - string->external.length = string_length; - - // Deallocate the old string. 
- if (string_is_external) allocator->free(string_start, string_space, allocator->handle); - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // We may already be space-optimal, and in that case we don't need to do anything. - sz_size_t new_space = string_length + 1; - if (string_space == new_space || !string_is_external) return string->external.start; - - sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); - if (!new_start) return SZ_NULL_CHAR; - - sz_copy(new_start, string_start, string_length); - string->external.start = new_start; - string->external.space = new_space; - string->external.padding = 0; - string->external.length = string_length; - - // Deallocate the old string. - if (string_is_external) allocator->free(string_start, string_space, allocator->handle); - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length, - sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // The user intended to extend the string. - offset = sz_min_of_two(offset, string_length); - - // If we are lucky, no memory allocations will be needed. - if (string_length + added_length < string_space) { - sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); - string_start[string_length + added_length] = 0; - // Even if the string is on the stack, the `+=` won't affect the tail of the string. - string->external.length += added_length; - } - // If we are not lucky, we need to allocate more memory. - else { - sz_size_t next_planned_size = sz_max_of_two(SZ_CACHE_LINE_WIDTH, string_space * 2ull); - sz_size_t min_needed_space = sz_size_bit_ceil(offset + string_length + added_length + 1); - sz_size_t new_space = sz_max_of_two(min_needed_space, next_planned_size); - string_start = sz_string_reserve(string, new_space - 1, allocator); - if (!string_start) return SZ_NULL_CHAR; - - // Copy into the new buffer. - sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); - string_start[string_length + added_length] = 0; - string->external.length = string_length + added_length; - } - - return string_start; -} - -SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length) { - - sz_assert(string && "String can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // Normalize the offset, it can't be larger than the length. - offset = sz_min_of_two(offset, string_length); - - // We shouldn't normalize the length, to avoid overflowing on `offset + length >= string_length`, - // if receiving `length == SZ_SIZE_MAX`. 
After following expression the `length` will contain - // exactly the delta between original and final length of this `string`. - length = sz_min_of_two(length, string_length - offset); - - // There are 2 common cases, that wouldn't even require a `memmove`: - // 1. Erasing the entire contents of the string. - // In that case `length` argument will be equal or greater than `length` member. - // 2. Removing the tail of the string with something like `string.pop_back()` in C++. - // - // In both of those, regardless of the location of the string - stack or heap, - // the erasing is as easy as setting the length to the offset. - // In every other case, we must `memmove` the tail of the string to the left. - if (offset + length < string_length) - sz_move(string_start + offset, string_start + offset + length, string_length - offset - length); - - // The `string->external.length = offset` assignment would discard last characters - // of the on-the-stack string, but inplace subtraction would work. - string->external.length -= length; - string_start[string_length - length] = 0; - return length; -} - -SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator) { - if (!sz_string_is_on_stack(string)) - allocator->free(string->external.start, string->external.space, allocator->handle); - sz_string_init(string); -} - -// When overriding libc, disable optimisations for this function beacuse MSVC will optimize the loops into a memset. -// Which then causes a stack overflow due to infinite recursion (memset -> sz_fill_serial -> memset). -#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC -#pragma optimize("", off) -#endif -SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - sz_ptr_t end = target + length; - // Dealing with short strings, a single sequential pass would be faster. - // If the size is larger than 2 words, then at least 1 of them will be aligned. - // But just one aligned word may not be worth SWAR. - if (length < SZ_SWAR_THRESHOLD) - while (target != end) *(target++) = value; - - // In case of long strings, skip unaligned bytes, and then fill the rest in 64-bit chunks. - else { - sz_u64_t value64 = (sz_u64_t)value * 0x0101010101010101ull; - while ((sz_size_t)target & 7ull) *(target++) = value; - while (target + 8 <= end) *(sz_u64_t *)target = value64, target += 8; - while (target != end) *(target++) = value; - } -} -#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC -#pragma optimize("", on) -#endif - -SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_MISALIGNED_LOADS - while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)source, target += 8, source += 8, length -= 8; -#endif - while (length--) *(target++) = *(source++); -} - -SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // Implementing `memmove` is trickier, than `memcpy`, as the ranges may overlap. - // Existing implementations often have two passes, in normal and reversed order, - // depending on the relation of `target` and `source` addresses. - // https://student.cs.uwaterloo.ca/~cs350/common/os161-src-html/doxygen/html/memmove_8c_source.html - // https://marmota.medium.com/c-language-making-memmove-def8792bb8d5 - // - // We can use the `memcpy` like left-to-right pass if we know that the `target` is before `source`. - // Or if we know that they don't intersect! 
In that case the traversal order is irrelevant, - // but older CPUs may predict and fetch forward-passes better. - if (target < source || target >= source + length) { -#if SZ_USE_MISALIGNED_LOADS - while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)(source), target += 8, source += 8, length -= 8; -#endif - while (length--) *(target++) = *(source++); - } - else { - // Jump to the end and walk backwards. - target += length, source += length; -#if SZ_USE_MISALIGNED_LOADS - while (length >= 8) *(sz_u64_t *)(target -= 8) = *(sz_u64_t const *)(source -= 8), length -= 8; -#endif - while (length--) *(--target) = *(--source); - } -} - -#pragma endregion - -/* - * @brief Serial implementation for strings sequence processing. - */ -#pragma region Serial Implementation for Sequences - -SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) { - - sz_size_t matches = 0; - while (matches != sequence->count && predicate(sequence, sequence->order[matches])) ++matches; - - for (sz_size_t i = matches + 1; i < sequence->count; ++i) - if (predicate(sequence, sequence->order[i])) - sz_u64_swap(sequence->order + i, sequence->order + matches), ++matches; - - return matches; -} - -SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) { - - sz_size_t start_b = partition + 1; - - // If the direct merge is already sorted - if (!less(sequence, sequence->order[start_b], sequence->order[partition])) return; - - sz_size_t start_a = 0; - while (start_a <= partition && start_b <= sequence->count) { - - // If element 1 is in right place - if (!less(sequence, sequence->order[start_b], sequence->order[start_a])) { start_a++; } - else { - sz_size_t value = sequence->order[start_b]; - sz_size_t index = start_b; - - // Shift all the elements between element 1 - // element 2, right by 1. 
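            // In other words, order[start_a .. index - 1] moves one slot to the right, and the smaller
            // `value` then lands at order[start_a], keeping the already-merged prefix sorted.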
- while (index != start_a) { sequence->order[index] = sequence->order[index - 1], index--; } - sequence->order[start_a] = value; - - // Update all the pointers - start_a++; - partition++; - start_b++; - } - } -} - -SZ_PUBLIC void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - sz_u64_t *keys = sequence->order; - sz_size_t keys_count = sequence->count; - for (sz_size_t i = 1; i < keys_count; i++) { - sz_u64_t i_key = keys[i]; - sz_size_t j = i; - for (; j > 0 && less(sequence, i_key, keys[j - 1]); --j) keys[j] = keys[j - 1]; - keys[j] = i_key; - } -} - -SZ_INTERNAL void _sz_sift_down(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start, - sz_size_t end) { - sz_size_t root = start; - while (2 * root + 1 <= end) { - sz_size_t child = 2 * root + 1; - if (child + 1 <= end && less(sequence, order[child], order[child + 1])) { child++; } - if (!less(sequence, order[root], order[child])) { return; } - sz_u64_swap(order + root, order + child); - root = child; - } -} - -SZ_INTERNAL void _sz_heapify(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t count) { - sz_size_t start = (count - 2) / 2; - while (1) { - _sz_sift_down(sequence, less, order, start, count - 1); - if (start == 0) return; - start--; - } -} - -SZ_INTERNAL void _sz_heapsort(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last) { - sz_u64_t *order = sequence->order; - sz_size_t count = last - first; - _sz_heapify(sequence, less, order + first, count); - sz_size_t end = count - 1; - while (end > 0) { - sz_u64_swap(order + first, order + first + end); - end--; - _sz_sift_down(sequence, less, order + first, 0, end); - } -} - -SZ_PUBLIC void sz_sort_introsort_recursion(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, - sz_size_t last, sz_size_t depth) { - - sz_size_t length = last - first; - switch (length) { - case 0: - case 1: return; - case 2: - if (less(sequence, sequence->order[first + 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[first + 1]); - return; - case 3: { - sz_u64_t a = sequence->order[first]; - sz_u64_t b = sequence->order[first + 1]; - sz_u64_t c = sequence->order[first + 2]; - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - if (less(sequence, c, b)) sz_u64_swap(&c, &b); - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - sequence->order[first] = a; - sequence->order[first + 1] = b; - sequence->order[first + 2] = c; - return; - } - } - // Until a certain length, the quadratic-complexity insertion-sort is fine - if (length <= 16) { - sz_sequence_t sub_seq = *sequence; - sub_seq.order += first; - sub_seq.count = length; - sz_sort_insertion(&sub_seq, less); - return; - } - - // Fallback to N-logN-complexity heap-sort - if (depth == 0) { - _sz_heapsort(sequence, less, first, last); - return; - } - - --depth; - - // Median-of-three logic to choose pivot - sz_size_t median = first + length / 2; - if (less(sequence, sequence->order[median], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[median]); - if (less(sequence, sequence->order[last - 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[last - 1]); - if (less(sequence, sequence->order[median], sequence->order[last - 1])) - sz_u64_swap(&sequence->order[median], &sequence->order[last - 1]); - - // Partition using the median-of-three as the pivot - sz_u64_t pivot = sequence->order[median]; - sz_size_t left = first; - 
sz_size_t right = last - 1; - while (1) { - while (less(sequence, sequence->order[left], pivot)) left++; - while (less(sequence, pivot, sequence->order[right])) right--; - if (left >= right) break; - sz_u64_swap(&sequence->order[left], &sequence->order[right]); - left++; - right--; - } - - // Recursively sort the partitions - sz_sort_introsort_recursion(sequence, less, first, left, depth); - sz_sort_introsort_recursion(sequence, less, right + 1, last, depth); -} - -SZ_PUBLIC void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - if (sequence->count == 0) return; - sz_size_t size_is_not_power_of_two = (sequence->count & (sequence->count - 1)) != 0; - sz_size_t depth_limit = sz_size_log2i_nonzero(sequence->count) + size_is_not_power_of_two; - sz_sort_introsort_recursion(sequence, less, 0, sequence->count, depth_limit); -} - -SZ_PUBLIC void sz_sort_recursion( // - sz_sequence_t *sequence, sz_size_t bit_idx, sz_size_t bit_max, sz_sequence_comparator_t comparator, - sz_size_t partial_order_length) { - - if (!sequence->count) return; - - // Array of size one doesn't need sorting - only needs the prefix to be discarded. - if (sequence->count == 1) { - sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; - order_half_words[1] = 0; - return; - } - - // Partition a range of integers according to a specific bit value - sz_size_t split = 0; - sz_u64_t mask = (1ull << 63) >> bit_idx; - - // The clean approach would be to perform a single pass over the sequence. - // - // while (split != sequence->count && !(sequence->order[split] & mask)) ++split; - // for (sz_size_t i = split + 1; i < sequence->count; ++i) - // if (!(sequence->order[i] & mask)) sz_u64_swap(sequence->order + i, sequence->order + split), ++split; - // - // This, however, doesn't take into account the high relative cost of writes and swaps. - // To circumvent that, we can first count the total number entries to be mapped into either part. - // And then walk through both parts, swapping the entries that are in the wrong part. - // This would often lead to ~15% performance gain. - sz_size_t count_with_bit_set = 0; - for (sz_size_t i = 0; i != sequence->count; ++i) count_with_bit_set += (sequence->order[i] & mask) != 0; - split = sequence->count - count_with_bit_set; - - // It's possible that the sequence is already partitioned. - if (split != 0 && split != sequence->count) { - // Use two pointers to efficiently reposition elements. - // On pointer walks left-to-right from the start, and the other walks right-to-left from the end. - sz_size_t left = 0; - sz_size_t right = sequence->count - 1; - while (1) { - // Find the next element with the bit set on the left side. - while (left < split && !(sequence->order[left] & mask)) ++left; - // Find the next element without the bit set on the right side. - while (right >= split && (sequence->order[right] & mask)) --right; - // Swap the mispositioned elements. - if (left < split && right >= split) { - sz_u64_swap(sequence->order + left, sequence->order + right); - ++left; - --right; - } - else { break; } - } - } - - // Go down recursively. - if (bit_idx < bit_max) { - sz_sequence_t a = *sequence; - a.count = split; - sz_sort_recursion(&a, bit_idx + 1, bit_max, comparator, partial_order_length); - - sz_sequence_t b = *sequence; - b.order += split; - b.count -= split; - sz_sort_recursion(&b, bit_idx + 1, bit_max, comparator, partial_order_length); - } - // Reached the end of recursion. - else { - // Discard the prefixes. 
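        // On a little-endian machine, the high 32 bits of each `order[i]` held the exported 4-byte
        // string prefix during the radix passes; zeroing them leaves only the original index before
        // handing both halves to the comparison-based introsort below.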
- sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; - for (sz_size_t i = 0; i != sequence->count; ++i) { order_half_words[i * 2 + 1] = 0; } - - sz_sequence_t a = *sequence; - a.count = split; - sz_sort_introsort(&a, comparator); - - sz_sequence_t b = *sequence; - b.order += split; - b.count -= split; - sz_sort_introsort(&b, comparator); - } -} - -SZ_INTERNAL sz_bool_t _sz_sort_is_less(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { - sz_cptr_t i_str = sequence->get_start(sequence, i_key); - sz_cptr_t j_str = sequence->get_start(sequence, j_key); - sz_size_t i_len = sequence->get_length(sequence, i_key); - sz_size_t j_len = sequence->get_length(sequence, j_key); - return (sz_bool_t)(sz_order_serial(i_str, i_len, j_str, j_len) == sz_less_k); -} - -SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t partial_order_length) { - -#if SZ_DETECT_BIG_ENDIAN - // TODO: Implement partial sort for big-endian systems. For now this sorts the whole thing. - sz_unused(partial_order_length); - sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); -#else - - // Export up to 4 bytes into the `sequence` bits themselves - for (sz_size_t i = 0; i != sequence->count; ++i) { - sz_cptr_t begin = sequence->get_start(sequence, sequence->order[i]); - sz_size_t length = sequence->get_length(sequence, sequence->order[i]); - length = length > 4u ? 4u : length; - sz_ptr_t prefix = (sz_ptr_t)&sequence->order[i]; - for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; - } - - // Perform optionally-parallel radix sort on them - sz_sort_recursion(sequence, 0, 32, (sz_sequence_comparator_t)_sz_sort_is_less, partial_order_length); -#endif -} - -SZ_PUBLIC void sz_sort(sz_sequence_t *sequence) { -#if SZ_DETECT_BIG_ENDIAN - sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); -#else - sz_sort_partial(sequence, sequence->count); -#endif -} - -#pragma endregion - -/* - * @brief AVX2 implementation of the string search algorithms. - * Very minimalistic, but still faster than the serial implementation. - */ -#pragma region AVX2 Implementation - -#if SZ_USE_X86_AVX2 -#pragma GCC push_options -#pragma GCC target("avx2") -#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) -#include - -/** - * @brief Helper structure to simplify work with 256-bit registers. - */ -typedef union sz_u256_vec_t { - __m256i ymm; - __m128i xmms[2]; - sz_u64_t u64s[4]; - sz_u32_t u32s[8]; - sz_u16_t u16s[16]; - sz_u8_t u8s[32]; -} sz_u256_vec_t; - -SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: - //! https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations - return sz_order_serial(a, a_length, b, b_length); -} - -SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_u256_vec_t a_vec, b_vec; - - while (length >= 32) { - a_vec.ymm = _mm256_lddqu_si256((__m256i const *)a); - b_vec.ymm = _mm256_lddqu_si256((__m256i const *)b); - // One approach can be to use "movemasks", but we could also use a bitwise matching like `_mm256_testnzc_si256`. 
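        // A possible shape of that alternative, shown for illustration only (not what this build uses):
        //
        //     __m256i xored = _mm256_xor_si256(a_vec.ymm, b_vec.ymm);
        //     if (!_mm256_testz_si256(xored, xored)) return sz_false_k; // any set bit means a mismatch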
- int difference_mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi8(a_vec.ymm, b_vec.ymm)); - if (difference_mask == 0) { a += 32, b += 32, length -= 32; } - else { return sz_false_k; } - } - - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; -} - -SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - char value_char = *(char *)&value; - __m256i value_vec = _mm256_set1_epi8(value_char); - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores". - // - // for (; length >= 32; target += 32, length -= 32) _mm256_storeu_si256(target, value_vec); - // sz_fill_serial(target, length, value); - // - // When the buffer is small, there isn't much to innovate. - if (length <= 32) sz_fill_serial(target, length, value); - // When the buffer is aligned, we can avoid any split-stores. - else { - sz_size_t head_length = (32 - ((sz_size_t)target % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - sz_u16_t value16 = (sz_u16_t)value * 0x0101u; - sz_u32_t value32 = (sz_u32_t)value16 * 0x00010001u; - sz_u64_t value64 = (sz_u64_t)value32 * 0x0000000100000001ull; - - // Fill the head of the buffer. This part is much cleaner with AVX-512. - if (head_length & 1) *(sz_u8_t *)target = value, target++, head_length--; - if (head_length & 2) *(sz_u16_t *)target = value16, target += 2, head_length -= 2; - if (head_length & 4) *(sz_u32_t *)target = value32, target += 4, head_length -= 4; - if (head_length & 8) *(sz_u64_t *)target = value64, target += 8, head_length -= 8; - if (head_length & 16) - _mm_store_si128((__m128i *)target, _mm_set1_epi8(value_char)), target += 16, head_length -= 16; - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - - // Fill the aligned body of the buffer. - for (; body_length >= 32; target += 32, body_length -= 32) _mm256_store_si256((__m256i *)target, value_vec); - - // Fill the tail of the buffer. This part is much cleaner with AVX-512. - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - if (tail_length & 16) - _mm_store_si128((__m128i *)target, _mm_set1_epi8(value_char)), target += 16, tail_length -= 16; - if (tail_length & 8) *(sz_u64_t *)target = value64, target += 8, tail_length -= 8; - if (tail_length & 4) *(sz_u32_t *)target = value32, target += 4, tail_length -= 4; - if (tail_length & 2) *(sz_u16_t *)target = value16, target += 2, tail_length -= 2; - if (tail_length & 1) *(sz_u8_t *)target = value, target++, tail_length--; - } -} - -SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores" and "loads". - // - // for (; length >= 32; target += 32, source += 32, length -= 32) - // _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - // sz_copy_serial(target, source, length); - // - // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core, - // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. - // For now, let's avoid the cases beyond the L2 size. 
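    // Here "huge" is a heuristic cut-off of 1 MB, roughly the per-core L2 capacity on the CPUs above.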
- int is_huge = length > 1ull * 1024ull * 1024ull; - if (length <= 32) { sz_copy_serial(target, source, length); } - // When dealing wirh larger arrays, the optimization is not as simple as with the `sz_fill_avx2` function, - // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer, - // we can use aligned loads and stores, and the performance will be great. - else if ((sz_size_t)target % 32 == 0 && (sz_size_t)source % 32 == 0 && !is_huge) { - for (; length >= 32; target += 32, source += 32, length -= 32) - _mm256_store_si256((__m256i *)target, _mm256_load_si256((__m256i const *)source)); - if (length) sz_copy_serial(target, source, length); - } - // The trickiest case is when both `source` and `target` are not aligned. - // In such and simpler cases we can copy enough bytes into `target` to reach its cacheline boundary, - // and then combine unaligned loads with aligned stores. - else { - sz_size_t head_length = (32 - ((sz_size_t)target % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - - // Fill the head of the buffer. This part is much cleaner with AVX-512. - if (head_length & 1) *(sz_u8_t *)target = *(sz_u8_t *)source, target++, source++, head_length--; - if (head_length & 2) *(sz_u16_t *)target = *(sz_u16_t *)source, target += 2, source += 2, head_length -= 2; - if (head_length & 4) *(sz_u32_t *)target = *(sz_u32_t *)source, target += 4, source += 4, head_length -= 4; - if (head_length & 8) *(sz_u64_t *)target = *(sz_u64_t *)source, target += 8, source += 8, head_length -= 8; - if (head_length & 16) - _mm_store_si128((__m128i *)target, _mm_lddqu_si128((__m128i const *)source)), target += 16, source += 16, - head_length -= 16; - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - - // Fill the aligned body of the buffer. - if (!is_huge) { - for (; body_length >= 32; target += 32, source += 32, body_length -= 32) - _mm256_store_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - } - // When the biffer is huge, we can traverse it in 2 directions. - else { - for (; body_length >= 64; target += 32, source += 32, body_length -= 64) { - _mm256_store_si256((__m256i *)(target), _mm256_lddqu_si256((__m256i const *)(source))); - _mm256_store_si256((__m256i *)(target + body_length - 32), - _mm256_lddqu_si256((__m256i const *)(source + body_length - 32))); - } - if (body_length) _mm256_store_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - } - - // Fill the tail of the buffer. This part is much cleaner with AVX-512. 
- sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - if (tail_length & 16) - _mm_store_si128((__m128i *)target, _mm_lddqu_si128((__m128i const *)source)), target += 16, source += 16, - tail_length -= 16; - if (tail_length & 8) *(sz_u64_t *)target = *(sz_u64_t *)source, target += 8, source += 8, tail_length -= 8; - if (tail_length & 4) *(sz_u32_t *)target = *(sz_u32_t *)source, target += 4, source += 4, tail_length -= 4; - if (tail_length & 2) *(sz_u16_t *)target = *(sz_u16_t *)source, target += 2, source += 2, tail_length -= 2; - if (tail_length & 1) *(sz_u8_t *)target = *(sz_u8_t *)source, target++, source++, tail_length--; - } -} - -SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - if (target < source || target >= source + length) { - for (; length >= 32; target += 32, source += 32, length -= 32) - _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - while (length--) *(target++) = *(source++); - } - else { - // Jump to the end and walk backwards. - for (target += length, source += length; length >= 32; length -= 32) - _mm256_storeu_si256((__m256i *)(target -= 32), _mm256_lddqu_si256((__m256i const *)(source -= 32))); - while (length--) *(--target) = *(--source); - } -} - -SZ_PUBLIC sz_u64_t sz_checksum_avx2(sz_cptr_t text, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "loads". - // - // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core, - // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. - // For now, let's avoid the cases beyond the L2 size. - int is_huge = length > 1ull * 1024ull * 1024ull; - - // When the buffer is small, there isn't much to innovate. - if (length <= 32) { return sz_checksum_serial(text, length); } - else if (!is_huge) { - sz_u256_vec_t text_vec, sums_vec; - sums_vec.ymm = _mm256_setzero_si256(); - for (; length >= 32; text += 32, length -= 32) { - text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - sz_u64_t result = low + high; - if (length) result += sz_checksum_serial(text, length); - return result; - } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // Most notably, we can avoid populating the cache with the entire buffer, and instead traverse it in 2 directions. - else { - sz_size_t head_length = (32 - ((sz_size_t)text % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(text + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - sz_u64_t result = 0; - - // Handle the head - while (head_length--) result += *text++; - - sz_u256_vec_t text_vec, sums_vec; - sums_vec.ymm = _mm256_setzero_si256(); - // Fill the aligned body of the buffer. 
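        // Reminder: `_mm256_sad_epu8(x, zero)` sums each group of 8 consecutive bytes of `x` into a
        // 64-bit lane, which is what turns the byte-wise accumulation into four running 64-bit counters.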
- if (!is_huge) { - for (; body_length >= 32; text += 32, body_length -= 32) { - text_vec.ymm = _mm256_stream_load_si256((__m256i const *)text); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - } - // When the biffer is huge, we can traverse it in 2 directions. - else { - sz_u256_vec_t text_reversed_vec, sums_reversed_vec; - sums_reversed_vec.ymm = _mm256_setzero_si256(); - for (; body_length >= 64; text += 64, body_length -= 64) { - text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - text_reversed_vec.ymm = _mm256_stream_load_si256((__m256i *)(text + body_length - 64)); - sums_reversed_vec.ymm = _mm256_add_epi64( - sums_reversed_vec.ymm, _mm256_sad_epu8(text_reversed_vec.ymm, _mm256_setzero_si256())); - } - if (body_length >= 32) { - text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, sums_reversed_vec.ymm); - } - - // Handle the tail - while (tail_length--) result += *text++; - - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - result += low + high; - return result; - } -} - -SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - // But if at least 3 cache lines are touched, the AVX-2 implementation should be faster. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } - - // We need to pull the lookup table into 8x YMM registers. - // The biggest issue is reorganizing the data in the lookup table, as AVX2 doesn't have 256-bit shuffle, - // it only has 128-bit "within-lane" shuffle. Still, it's wiser to use full YMM registers, instead of XMM, - // so that we can at least compensate high latency with twice larger window and one more level of lookup. 
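    // Each of the 16 broadcasts below replicates one 16-byte slice of the 256-byte table into both
    // 128-bit lanes, so the in-lane `_mm256_shuffle_epi8` can later index that slice by the low nibble
    // of every input byte.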
- sz_u256_vec_t lut_0_to_15_vec, lut_16_to_31_vec, lut_32_to_47_vec, lut_48_to_63_vec, // - lut_64_to_79_vec, lut_80_to_95_vec, lut_96_to_111_vec, lut_112_to_127_vec, // - lut_128_to_143_vec, lut_144_to_159_vec, lut_160_to_175_vec, lut_176_to_191_vec, // - lut_192_to_207_vec, lut_208_to_223_vec, lut_224_to_239_vec, lut_240_to_255_vec; - - lut_0_to_15_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut))); - lut_16_to_31_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 16))); - lut_32_to_47_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 32))); - lut_48_to_63_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 48))); - lut_64_to_79_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 64))); - lut_80_to_95_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 80))); - lut_96_to_111_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 96))); - lut_112_to_127_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 112))); - lut_128_to_143_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 128))); - lut_144_to_159_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 144))); - lut_160_to_175_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 160))); - lut_176_to_191_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 176))); - lut_192_to_207_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 192))); - lut_208_to_223_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 208))); - lut_224_to_239_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 224))); - lut_240_to_255_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 240))); - - // Assuming each lookup is performed within 16 elements of 256, we need to reduce the scope by 16x = 2^4. - sz_u256_vec_t not_first_bit_vec, not_second_bit_vec, not_third_bit_vec, not_fourth_bit_vec; - - /// Top and bottom nibbles of the source are used separately. - sz_u256_vec_t source_vec, source_bot_vec; - sz_u256_vec_t blended_0_to_31_vec, blended_32_to_63_vec, blended_64_to_95_vec, blended_96_to_127_vec, - blended_128_to_159_vec, blended_160_to_191_vec, blended_192_to_223_vec, blended_224_to_255_vec; - - // Handling the head. - while (length >= 32) { - // Load and separate the nibbles of each byte in the source. - source_vec.ymm = _mm256_lddqu_si256((__m256i const *)source); - source_bot_vec.ymm = _mm256_and_si256(source_vec.ymm, _mm256_set1_epi8((char)0x0F)); - - // In the first round, we select using the 4th bit. 
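        // The idea of the reduction: each of the 8 blends below picks, per byte, between two adjacent
        // 16-entry slices using bit 0x10 of the source, shrinking 16 shuffled candidates to 8; the
        // following rounds repeat this with bits 0x20, 0x40, and 0x80 until one byte per position remains.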
- not_fourth_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x10), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_16_to_31_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_0_to_15_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_32_to_63_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_48_to_63_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_32_to_47_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_64_to_95_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_80_to_95_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_64_to_79_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_96_to_127_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_112_to_127_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_96_to_111_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_144_to_159_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_128_to_143_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_160_to_191_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_176_to_191_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_160_to_175_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_192_to_223_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_208_to_223_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_192_to_207_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_224_to_255_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_240_to_255_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_224_to_239_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - - // Perform a tree-like reduction of the 8x "blended" YMM registers, depending on the "source" content. - // The first round selects using the 3rd bit. - not_third_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x20), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_32_to_63_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_third_bit_vec.ymm); - blended_64_to_95_vec.ymm = _mm256_blendv_epi8( // - blended_96_to_127_vec.ymm, // - blended_64_to_95_vec.ymm, // - not_third_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - blended_160_to_191_vec.ymm, // - blended_128_to_159_vec.ymm, // - not_third_bit_vec.ymm); - blended_192_to_223_vec.ymm = _mm256_blendv_epi8( // - blended_224_to_255_vec.ymm, // - blended_192_to_223_vec.ymm, // - not_third_bit_vec.ymm); - - // The second round selects using the 2nd bit. - not_second_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x40), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_64_to_95_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_second_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - blended_192_to_223_vec.ymm, // - blended_128_to_159_vec.ymm, // - not_second_bit_vec.ymm); - - // The third round selects using the 1st bit. 
- not_first_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x80), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_128_to_159_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_first_bit_vec.ymm); - - // And dump the result into the target. - _mm256_storeu_si256((__m256i *)target, blended_0_to_31_vec.ymm); - source += 32, target += 32, length -= 32; - } - - // Handle the tail. - if (length) sz_look_up_transform_serial(source, length, lut, target); -} - -SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - int mask; - sz_u256_vec_t h_vec, n_vec; - n_vec.ymm = _mm256_set1_epi8(n[0]); - - while (h_length >= 32) { - h_vec.ymm = _mm256_lddqu_si256((__m256i const *)h); - mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm)); - if (mask) return h + sz_u32_ctz(mask); - h += 32, h_length -= 32; - } - - return sz_find_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - int mask; - sz_u256_vec_t h_vec, n_vec; - n_vec.ymm = _mm256_set1_epi8(n[0]); - - while (h_length >= 32) { - h_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + h_length - 32)); - mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm)); - if (mask) return h + h_length - 1 - sz_u32_clz(mask); - h_length -= 32; - } - - return sz_rfind_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_avx2(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into YMM registers. - int matches; - sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]); - n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]); - n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]); - - // Scan through the string. - for (; h_length >= n_length + 32; h += 32, h_length -= 32) { - h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_first)); - h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_mid)); - h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_last)); - matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); - while (matches) { - int potential_offset = sz_u32_ctz(matches); - if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - - return sz_find_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx2(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. 
- sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into YMM registers. - int matches; - sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]); - n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]); - n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]); - - // Scan through the string. - sz_cptr_t h_reversed; - for (; h_length >= n_length + 32; h_length -= 32) { - h_reversed = h + h_length - n_length - 32 + 1; - h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_first)); - h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_mid)); - h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_last)); - matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); - while (matches) { - int potential_offset = sz_u32_clz(matches); - if (sz_equal(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - matches &= ~(1 << (31 - potential_offset)); - } - } - - return sz_rfind_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_avx2(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - - // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. - // That way when we invoke `_mm256_shuffle_epi8` we can use the same mask for both lanes. - sz_u256_vec_t filter_even_vec, filter_odd_vec; - for (sz_size_t i = 0; i != 16; ++i) - filter_even_vec.u8s[i] = filter->_u8s[i * 2], filter_odd_vec.u8s[i] = filter->_u8s[i * 2 + 1]; - filter_even_vec.xmms[1] = filter_even_vec.xmms[0]; - filter_odd_vec.xmms[1] = filter_odd_vec.xmms[0]; - - sz_u256_vec_t text_vec; - sz_u256_vec_t matches_vec; - sz_u256_vec_t lower_nibbles_vec, higher_nibbles_vec; - sz_u256_vec_t bitset_even_vec, bitset_odd_vec; - sz_u256_vec_t bitmask_vec, bitmask_lookup_vec; - bitmask_lookup_vec.ymm = _mm256_set_epi8(-128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); - - while (length >= 32) { - // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set" - // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so - // StrinZilla uses a somewhat different approach. - // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new - // - // sz_u8_t input = *(sz_u8_t const *)text; - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = filter_even_vec.u8s[hi_nibble]; - // sz_u8_t bitset_odd = filter_odd_vec.u8s[hi_nibble]; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_u8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd; - // if ((bitset & bitmask) != 0) return text; - // else { length--, text++; } - // - // The nice part about this, loading the strided data is vey easy with Arm NEON, - // while with x86 CPUs after AVX, shuffles within 256 bits shouldn't be an issue either. 
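- // In scalar form, membership of a byte `c` is bit `c & 7` of `filter->_u8s[c >> 3]`. Since - // `c >> 3 == hi_nibble * 2 + (lo_nibble >= 8)`, the "even" vector serves low nibbles 0-7 and the - // "odd" vector serves nibbles 8-15, which is what the blend on `take_first` below selects between.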
- text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); - lower_nibbles_vec.ymm = _mm256_and_si256(text_vec.ymm, _mm256_set1_epi8(0x0f)); - bitmask_vec.ymm = _mm256_shuffle_epi8(bitmask_lookup_vec.ymm, lower_nibbles_vec.ymm); - // - // At this point we can validate the `bitmask_vec` contents like this: - // - // for (sz_size_t i = 0; i != 32; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); - // } - // - // Shift right every byte by 4 bits. - // There is no `_mm256_srli_epi8` intrinsic, so we have to use `_mm256_srli_epi16` - // and combine it with a mask to clear the higher bits. - higher_nibbles_vec.ymm = _mm256_and_si256(_mm256_srli_epi16(text_vec.ymm, 4), _mm256_set1_epi8(0x0f)); - bitset_even_vec.ymm = _mm256_shuffle_epi8(filter_even_vec.ymm, higher_nibbles_vec.ymm); - bitset_odd_vec.ymm = _mm256_shuffle_epi8(filter_odd_vec.ymm, higher_nibbles_vec.ymm); - // - // At this point we can validate the `bitset_even_vec` and `bitset_odd_vec` contents like this: - // - // for (sz_size_t i = 0; i != 32; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t const *bitset_ptr = &filter->_u8s[0]; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; - // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); - // } - // - __m256i take_first = _mm256_cmpgt_epi8(_mm256_set1_epi8(8), lower_nibbles_vec.ymm); - bitset_even_vec.ymm = _mm256_blendv_epi8(bitset_odd_vec.ymm, bitset_even_vec.ymm, take_first); - - // It would have been great to have an instruction that tests the bits and then broadcasts - // the matching bit into all bits in that byte. But we don't have that, so we have to - // `and`, `cmpeq`, `movemask`, and then invert at the end... - matches_vec.ymm = _mm256_and_si256(bitset_even_vec.ymm, bitmask_vec.ymm); - matches_vec.ymm = _mm256_cmpeq_epi8(matches_vec.ymm, _mm256_setzero_si256()); - int matches_mask = ~_mm256_movemask_epi8(matches_vec.ymm); - if (matches_mask) { - int offset = sz_u32_ctz(matches_mask); - return text + offset; - } - else { text += 32, length -= 32; } - } - - return sz_find_charset_serial(text, length, filter); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx2(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); -} - -/** - * @brief There is no AVX2 instruction for fast multiplication of 64-bit integers. - * This implementation is coming from Agner Fog's Vector Class Library. 
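- * With a = a_hi * 2^32 + a_lo and b = b_hi * 2^32 + b_lo, the truncated 64-bit product is - * a_lo * b_lo + ((a_lo * b_hi + a_hi * b_lo) << 32), as the a_hi * b_hi term overflows past bit 63. - * `_mm256_mul_epu32` supplies the full low product, while `_mm256_mullo_epi32` and a horizontal add - * assemble the cross terms.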
- */ -SZ_INTERNAL __m256i _mm256_mul_epu64(__m256i a, __m256i b) { - __m256i bswap = _mm256_shuffle_epi32(b, 0xB1); - __m256i prodlh = _mm256_mullo_epi32(a, bswap); - __m256i zero = _mm256_setzero_si256(); - __m256i prodlh2 = _mm256_hadd_epi32(prodlh, zero); - __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2, 0x73); - __m256i prodll = _mm256_mul_epu32(a, b); - __m256i prod = _mm256_add_epi64(prodll, prodlh3); - return prod; -} - -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - if (length < 4 * window_length) { - sz_hashes_serial(start, length, window_length, step, callback, callback_handle); - return; - } - - // Using AVX2, we can perform 4 long integer multiplications and additions within one register. - // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel. - sz_size_t const max_hashes = length - window_length + 1; - sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads. - sz_u8_t const *text_first = (sz_u8_t const *)start; - sz_u8_t const *text_second = text_first + min_hashes_per_thread; - sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2; - sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3; - sz_u8_t const *text_end = text_first + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Broadcast the constants into the registers. - sz_u256_vec_t prime_vec, golden_ratio_vec; - sz_u256_vec_t base_low_vec, base_high_vec, prime_power_low_vec, prime_power_high_vec, shift_high_vec; - base_low_vec.ymm = _mm256_set1_epi64x(31ull); - base_high_vec.ymm = _mm256_set1_epi64x(257ull); - shift_high_vec.ymm = _mm256_set1_epi64x(77ull); - prime_vec.ymm = _mm256_set1_epi64x(SZ_U64_MAX_PRIME); - golden_ratio_vec.ymm = _mm256_set1_epi64x(11400714819323198485ull); - prime_power_low_vec.ymm = _mm256_set1_epi64x(prime_power_low); - prime_power_high_vec.ymm = _mm256_set1_epi64x(prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u256_vec_t hash_low_vec, hash_high_vec, hash_mix_vec, chars_low_vec, chars_high_vec; - hash_low_vec.ymm = _mm256_setzero_si256(); - hash_high_vec.ymm = _mm256_setzero_si256(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. 
- hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - } - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t const step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - hash_low_vec.ymm = - _mm256_sub_epi64(hash_low_vec.ymm, _mm256_mul_epu64(chars_low_vec.ymm, prime_power_low_vec.ymm)); - hash_high_vec.ymm = - _mm256_sub_epi64(hash_high_vec.ymm, _mm256_mul_epu64(chars_high_vec.ymm, prime_power_high_vec.ymm)); - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. - hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - - // 5. 
Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - if ((cycle & step_mask) == 0) { - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - } - } -} - -#pragma clang attribute pop -#pragma GCC pop_options -#endif -#pragma endregion - -/* - * @brief AVX-512 implementation of the string search algorithms. - * - * Different subsets of AVX-512 were introduced in different years: - * - 2017 SkyLake: F, CD, ER, PF, VL, DQ, BW - * - 2018 CannonLake: IFMA, VBMI - * - 2019 IceLake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES - * - 2020 TigerLake: VP2INTERSECT - */ -#pragma region AVX512 Implementation - -#if SZ_USE_X86_AVX512 -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) -#include - -/** - * @brief Helper structure to simplify work with 512-bit registers. - */ -typedef union sz_u512_vec_t { - __m512i zmm; - __m256i ymms[2]; - __m128i xmms[4]; - sz_u64_t u64s[8]; - sz_u32_t u32s[16]; - sz_u16_t u16s[32]; - sz_u8_t u8s[64]; - sz_i64_t i64s[8]; - sz_i32_t i32s[16]; -} sz_u512_vec_t; - -SZ_INTERNAL __mmask64 _sz_u64_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 64: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 64: - return _bzhi_u64(0xFFFFFFFFFFFFFFFF, n < 64 ? (sz_u32_t)n : 64); -} - -SZ_INTERNAL __mmask32 _sz_u32_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 32: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 32: - return _bzhi_u32(0xFFFFFFFF, n < 32 ? (sz_u32_t)n : 32); -} - -SZ_INTERNAL __mmask16 _sz_u16_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 16: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 16: - return _bzhi_u32(0xFFFFFFFF, n < 16 ? 
(sz_u32_t)n : 16); -} - -SZ_INTERNAL __mmask16 _sz_u16_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 16: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 16: - return (__mmask16)_bzhi_u32(0xFFFFFFFF, (sz_u32_t)n); -} - -SZ_INTERNAL __mmask32 _sz_u32_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 32: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 32: - return _bzhi_u32(0xFFFFFFFF, (sz_u32_t)n); -} - -SZ_INTERNAL __mmask64 _sz_u64_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 64: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 64: - return _bzhi_u64(0xFFFFFFFFFFFFFFFF, (sz_u32_t)n); -} - -SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - sz_u512_vec_t a_vec, b_vec; - - // Pointer arithmetic is cheap, fetching memory is not! - // So we can use the masked loads to fetch at most one cache-line for each string, - // compare the prefixes, and only then move forward. - sz_size_t a_head_length = 64 - ((sz_size_t)a % 64); // 63 or less. - sz_size_t b_head_length = 64 - ((sz_size_t)b % 64); // 63 or less. - a_head_length = a_head_length < a_length ? a_head_length : a_length; - b_head_length = b_head_length < b_length ? b_head_length : b_length; - sz_size_t head_length = a_head_length < b_head_length ? a_head_length : b_head_length; - __mmask64 head_mask = _sz_u64_mask_until(head_length); - a_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, b); - __mmask64 mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - else if (head_length == a_length && head_length == b_length) { return sz_equal_k; } - else { a += head_length, b += head_length, a_length -= head_length, b_length -= head_length; } - - // The rare case, when both string are very long. - __mmask64 a_mask, b_mask; - while ((a_length >= 64) & (b_length >= 64)) { - a_vec.zmm = _mm512_loadu_si512(a); - b_vec.zmm = _mm512_loadu_si512(b); - mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - a += 64, b += 64, a_length -= 64, b_length -= 64; - } - - // In most common scenarios at least one of the strings is under 64 bytes. - if (a_length | b_length) { - a_mask = _sz_u64_clamp_mask_until(a_length); - b_mask = _sz_u64_clamp_mask_until(b_length); - a_vec.zmm = _mm512_maskz_loadu_epi8(a_mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(b_mask, b); - // The AVX-512 `_mm512_mask_cmpneq_epi8_mask` intrinsics are generally handy in such environments. - // They, however, have latency 3 on most modern CPUs. Using AVX2: `_mm256_cmpeq_epi8` would have - // been cheaper, if we didn't have to apply `_mm256_movemask_epi8` afterwards. 
- mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - // From logic perspective, the hardest cases are "abc\0" and "abc". - // The result must be `sz_greater_k`, as the latter is shorter. - else { return _sz_order_scalars(a_length, b_length); } - } - - return sz_equal_k; -} - -SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - __mmask64 mask; - sz_u512_vec_t a_vec, b_vec; - - while (length >= 64) { - a_vec.zmm = _mm512_loadu_si512(a); - b_vec.zmm = _mm512_loadu_si512(b); - mask = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask != 0) return sz_false_k; - a += 64, b += 64, length -= 64; - } - - if (length) { - mask = _sz_u64_mask_until(length); - a_vec.zmm = _mm512_maskz_loadu_epi8(mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(mask, b); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpneq_epi8_mask(mask, a_vec.zmm, b_vec.zmm); - return (sz_bool_t)(mask == 0); - } - - return sz_true_k; -} - -SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - __m512i value_vec = _mm512_set1_epi8(value); - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores". - // - // for (; length >= 64; target += 64, length -= 64) _mm512_storeu_si512(target, value_vec); - // _mm512_mask_storeu_epi8(target, _sz_u64_mask_until(length), value_vec); - // - // When the buffer is small, there isn't much to innovate. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, value_vec); - } - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - _mm512_mask_storeu_epi8(target, head_mask, value_vec); - for (target += head_length; body_length >= 64; target += 64, body_length -= 64) - _mm512_store_si512(target, value_vec); - _mm512_mask_storeu_epi8(target, tail_mask, value_vec); - } -} - -SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores" and "loads". - // - // for (; length >= 64; target += 64, source += 64, length -= 64) - // _mm512_storeu_si512(target, _mm512_loadu_si512(source)); - // __mmask64 mask = _sz_u64_mask_until(length); - // _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - // - // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, - // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. 
- // With two strings, we may consider the overall workload huge, if each exceeds 1 MB in length. - int const is_huge = length >= 1ull * 1024ull * 1024ull; - - // When the buffer is small, there isn't much to innovate. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - } - // When dealing with larger arrays, the optimization is not as simple as with the `sz_fill_avx512` function, - // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer, - // we can use aligned loads and stores, and the performance will be great. - else if ((sz_size_t)target % 64 == 0 && (sz_size_t)source % 64 == 0 && !is_huge) { - for (; length >= 64; target += 64, source += 64, length -= 64) - _mm512_store_si512(target, _mm512_load_si512(source)); - // At this point the length is guaranteed to be under 64. - __mmask64 mask = _sz_u64_mask_until(length); - // Aligned loads and stores would work too, but it's not defined. - _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - } - // The trickiest case is when both `source` and `target` are not aligned. - // In this and simpler cases we can copy enough bytes into `target` to reach its cacheline boundary, - // and then combine unaligned loads with aligned stores. - else if (!is_huge) { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - for (target += head_length, source += head_length; body_length >= 64; - target += 64, source += 64, body_length -= 64) - _mm512_store_si512(target, _mm512_loadu_si512(source)); // Unaligned load, but aligned store! - _mm512_mask_storeu_epi8(target, tail_mask, _mm512_maskz_loadu_epi8(tail_mask, source)); - } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // - // 1. Moving in both directions to maximize the throughput, when fetching from multiple - // memory pages. Also helps with cache set-associativity issues, as we won't always - // be fetching the same entries in the lookup table. - // 2. Using non-temporal stores to avoid polluting the cache. - // 3. Prefetching the next cache line, to avoid stalling the CPU. This is generally useless - // for predictable patterns, so disregard this advice. - // - // Bidirectional traversal adds about 10%, accelerating from 11 GB/s to 12 GB/s. - // Using "streaming stores" boosts us from 12 GB/s to 19 GB/s. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; - sz_size_t tail_length = (sz_size_t)(target + length) % 64; - sz_size_t body_length = length - head_length - tail_length; - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - _mm512_mask_storeu_epi8(target + head_length + body_length, tail_mask, - _mm512_maskz_loadu_epi8(tail_mask, source)); - - // Now in the main loop, we can use non-temporal loads and stores, - // performing the operation in both directions.
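- // Each iteration below streams two cache lines - one from the front of the body and one from the - // back - so the pointers advance by 64 bytes while `body_length` shrinks by 128, and at most one - // middle cache line remains to be flushed after the loop.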
- for (target += head_length, source += head_length; // - body_length >= 128; // - target += 64, source += 64, body_length -= 128) { - _mm512_stream_si512((__m512i *)(target), _mm512_loadu_si512(source)); - _mm512_stream_si512((__m512i *)(target + body_length - 64), _mm512_loadu_si512(source + body_length - 64)); - } - if (body_length >= 64) _mm512_stream_si512((__m512i *)target, _mm512_loadu_si512(source)); - } -} - -SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - if (target == source) return; // Don't be silly, don't move the data if it's already there. - - // On very short buffers, that are one cache line in width or less, we don't need any loops. - // We can also avoid any data-dependencies between iterations, assuming we have 32 registers - // to pre-load the data, before writing it back. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - } - else if (length <= 128) { - sz_size_t last_length = length - 64; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_maskz_loadu_epi8(mask, source + 64); - _mm512_storeu_epi8(target, source0); - _mm512_mask_storeu_epi8(target + 64, mask, source1); - } - else if (length <= 192) { - sz_size_t last_length = length - 128; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_loadu_epi8(source + 64); - __m512i source2 = _mm512_maskz_loadu_epi8(mask, source + 128); - _mm512_storeu_epi8(target, source0); - _mm512_storeu_epi8(target + 64, source1); - _mm512_mask_storeu_epi8(target + 128, mask, source2); - } - else if (length <= 256) { - sz_size_t last_length = length - 192; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_loadu_epi8(source + 64); - __m512i source2 = _mm512_loadu_epi8(source + 128); - __m512i source3 = _mm512_maskz_loadu_epi8(mask, source + 192); - _mm512_storeu_epi8(target, source0); - _mm512_storeu_epi8(target + 64, source1); - _mm512_storeu_epi8(target + 128, source2); - _mm512_mask_storeu_epi8(target + 192, mask, source3); - } - - // If the regions don't overlap at all, just use "copy" and save some brain cells thinking about corner cases. - else if (target + length < source || target >= source + length) { sz_copy_avx512(target, source, length); } - - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - // The absolute most common case of using "moves" is shifting the data within a continuous buffer - // when adding a removing some values in it. In such cases, a typical shift is by 1, 2, 4, 8, 16, - // or 32 bytes, rarely larger. For small shifts, under the size of the ZMM register, we can use shuffles. 
- // - // Remember: - // - if we are shifting data left, that we are traversing to the right. - // - if we are shifting data right, that we are traversing to the left. - int const left_to_right_traversal = source > target; - - // Now we guarantee, that the relative shift within registers is from 1 to 63 bytes and the output is aligned. - // Hopefully, we need to shift more than two ZMM registers, so we could consider `valignr` instruction. - // Sadly, using `_mm512_alignr_epi8` doesn't make sense, as it operates at a 128-bit granularity. - // - // - `_mm256_alignr_epi8` shifts entire 256-bit register, but we need many of them. - // - `_mm512_alignr_epi32` shifts 512-bit chunks, but only if the `shift` is a multiple of 4 bytes. - // - `_mm512_alignr_epi64` shifts 512-bit chunks by 8 bytes. - // - // All of those have a latency of 1 cycle, and the shift amount must be an immediate value! - // For 1-byte-shift granularity, the `_mm512_permutex2var_epi8` has a latency of 6 and needs VBMI! - // The most efficient and broadly compatible alternative could be to use a combination of align and shuffle. - // A similar approach was outlined in "Byte-wise alignr in AVX512F" by Wojciech Muła. - // http://0x80.pl/notesen/2016-10-16-avx512-byte-alignr.html - // - // That solution, is extremely mouthful, assuming we need compile time constants for the shift amount. - // A cleaner one, with a latency of 3 cycles, is to use `_mm512_permutexvar_epi8` or - // `_mm512_mask_permutexvar_epi8`, which can be seen as combination of a cross-register shuffle and blend, - // and is available with VBMI. That solution is still noticeably slower than AVX2. - // - // The GLibC implementation also uses non-temporal stores for larger buffers, we don't. - // https://codebrowser.dev/glibc/glibc/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S.html - if (left_to_right_traversal) { - // Head, body, and tail. - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - for (target += head_length, source += head_length; body_length >= 64; - target += 64, source += 64, body_length -= 64) - _mm512_store_si512(target, _mm512_loadu_si512(source)); - _mm512_mask_storeu_epi8(target, tail_mask, _mm512_maskz_loadu_epi8(tail_mask, source)); - } - else { - // Tail, body, and head. 
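- // When the destination overlaps the source from the right, copying front-to-back would overwrite - // bytes that are still unread, so the tail is stored first and the body is walked backwards, - // one aligned cache line at a time.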
- _mm512_mask_storeu_epi8(target + head_length + body_length, tail_mask, - _mm512_maskz_loadu_epi8(tail_mask, source + head_length + body_length)); - for (; body_length >= 64; body_length -= 64) - _mm512_store_si512(target + head_length + body_length - 64, - _mm512_loadu_si512(source + head_length + body_length - 64)); - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - } - } -} - -SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); - - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + sz_u64_ctz(mask); - h += 64, h_length -= 64; - } - - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + sz_u64_ctz(mask); - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_avx512(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. - __mmask64 matches; - __mmask64 mask; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. - // We have several optimized versions of the lagorithm for shorter strings, - // but they all mimic the default case for unbounded length needles - if (n_length >= 64) { - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - if (sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - - // TODO: If the last character contains a bad byte, we can reposition the start of the next iteration. - // This will be very helpful for very long needles. - } - } - // If there are only 2 or 3 characters in the needle, we don't even need the nested loop. 
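- // For such short needles the probed "first", "mid", and "last" offsets cover every byte of the - // needle, so a position where all three comparisons agree is already a complete match and needs - // no extra verification.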
- else if (n_length <= 3) { - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - if (matches) return h + sz_u64_ctz(matches); - } - } - // If the needle is smaller than the size of the ZMM register, we can use masked comparisons - // to avoid the the inner-most nested loop and compare the entire needle against a haystack - // slice in 3 CPU cycles. - else { - __mmask64 n_mask = _sz_u64_mask_until(n_length); - sz_u512_vec_t n_full_vec, h_full_vec; - n_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, n); - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - h_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, h + potential_offset); - if (_mm512_mask_cmpneq_epi8_mask(n_mask, h_full_vec.zmm, n_full_vec.zmm) == 0) - return h + potential_offset; - matches &= matches - 1; - } - } - } - - // The "tail" of the function uses masked loads to process the remaining bytes. - { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - if (n_length <= 3 || sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); - - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h + h_length - 64); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + h_length - 1 - sz_u64_clz(mask); - h_length -= 64; - } - - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + 64 - sz_u64_clz(mask) - 1; - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. 
- if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx512(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. - __mmask64 mask; - __mmask64 matches; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. - sz_cptr_t h_reversed; - for (; h_length >= n_length + 64; h_length -= 64) { - h_reversed = h + h_length - n_length - 64 + 1; - h_first_vec.zmm = _mm512_loadu_si512(h_reversed + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h_reversed + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h_reversed + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_avx512(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } - - // The "tail" of the function uses masked loads to process the remaining bytes. - { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_avx512(h + 64 - potential_offset - 1, n, n_length)) - return h + 64 - potential_offset - 1; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } - - return SZ_NULL_CHAR; -} - -#pragma clang attribute pop -#pragma GCC pop_options - -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ - apply_to = function) - -/** - * @brief Computes the edit distance between two very short byte-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 63, and evaluates at most (63 * 2 + 1 = 127) diagonals, or just as many loop cycles. - * Supports an early exit, if the distance is bounded. - * Keeps all of the data and Levenshtein matrices skew diagonal in just a couple of registers. - * Benefits from the @b `vpermb` instructions, that can rotate the bytes across the entire ZMM register. 
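- * The traversal walks the skewed (anti-)diagonals of the Levenshtein matrix: every cell of a diagonal - * depends only on the two preceding diagonals, so all of its cells can be computed independently in - * byte-wide SIMD lanes, as long as no intermediate distance can exceed 255.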
- */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - - sz_size_t const max_length = 63u; - sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); - sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); - - // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register. - // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. - sz_size_t const shorter_dim = shorter_length + 1; - sz_size_t const longer_dim = longer_length + 1; - - // The next few buffers will be swapped around. - sz_u512_vec_t previous_vec, current_vec, next_vec; - sz_u512_vec_t gaps_vec, substitutions_vec; - - // Load the strings into ZMM registers - just once. - sz_u512_vec_t longer_vec, shorter_vec, shorter_rotated_vec, rotate_left_vec, rotate_right_vec, ones_vec, bound_vec; - longer_vec.zmm = _mm512_maskz_loadu_epi8(_sz_u64_mask_until(longer_length), longer); - rotate_left_vec.zmm = _mm512_set_epi8( // - 0, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, // - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, // - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, // - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - rotate_right_vec.zmm = _mm512_set_epi8( // - 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, // - 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, // - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, // - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 63); - ones_vec.zmm = _mm512_set1_epi8(1); - bound_vec.zmm = _mm512_set1_epi8(bound <= 255 ? (sz_u8_t)bound : 255); - - // To simplify comparisons and traversals, we want to reverse the order of bytes in the shorter string. - for (sz_size_t i = 0; i != shorter_length; ++i) shorter_vec.u8s[63 - i] = shorter[i]; - shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_vec.zmm); - - // Let's say we are dealing with 3 and 5 letter words. - // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim). - // It will have: - // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3. - // - 2 diagonals of fixed length, at positions: 4, 5. - // - 3 diagonals of decreasing length, at positions: 6, 7, 8. - sz_size_t const diagonals_count = shorter_dim + longer_dim - 1; - - // Initialize the first two diagonals: - // - // previous_vec.u8s[0] = 0; - // current_vec.u8s[0] = current_vec.u8s[1] = 1; - // - // We can do a similar thing with vector ops: - previous_vec.zmm = _mm512_setzero_si512(); - current_vec.zmm = _mm512_set1_epi8(1); - - // We skip diagonals 0 and 1, as they are trivial. - // We will start with diagonal 2, which has length 3, with the first and last elements being preset, - // so we are effectively computing just one value, as will be marked by a single set bit in - // the `next_diagonal_mask` on the very first iteration. - sz_size_t next_diagonal_index = 2; - __mmask64 next_diagonal_mask = 0; - - // Progress through the upper triangle of the Levenshtein matrix. 
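- // On every diagonal the classic recurrence applies: a substitution extends the diagonal two steps - // back (plus one on a mismatch), while insertions and deletions extend the two neighboring cells of - // the immediately preceding diagonal, each costing one extra edit; the minimum of those wins.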
- for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) { - // After this iteration, the values at offset `0` and `next_diagonal_index` in the `next_vec` - // should be set to `next_diagonal_index`, but it's easier to broadcast the value to the whole vector, - // and later merge with a mask with new values. - next_vec.zmm = _mm512_set1_epi8((sz_u8_t)next_diagonal_index); - - // The mask also adds one set bit. - next_diagonal_mask = _kor_mask64(next_diagonal_mask, 1); - next_diagonal_mask = _kshiftli_mask64(next_diagonal_mask, 1); - - // Check for equality between string slices. - __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - substitutions_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, substitutions_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(_mm512_permutexvar_epi8(rotate_right_vec.zmm, current_vec.zmm), current_vec.zmm), - ones_vec.zmm); - next_vec.zmm = _mm512_mask_min_epu8(next_vec.zmm, next_diagonal_mask, gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = current_vec.zmm; - current_vec.zmm = next_vec.zmm; - - // Shift the shorter string - shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_rotated_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. - __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - } - - // Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles. - for (; next_diagonal_index != longer_dim; ++next_diagonal_index) { - // After this iteration, the value `shorted_dim - 1` in the `next_vec` - // should be set to `next_diagonal_index`, but it's easier to broadcast the value to the whole vector, - // and later merge with a mask with new values. - next_vec.zmm = _mm512_set1_epi8((sz_u8_t)next_diagonal_index); - - // Make sure we update the first entry. - next_diagonal_mask = _kor_mask64(next_diagonal_mask, 1); - - // Check for equality between string slices. - __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(current_vec.zmm, _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm)), - ones_vec.zmm); - next_vec.zmm = _mm512_mask_min_epu8(next_vec.zmm, next_diagonal_mask, gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm); - current_vec.zmm = next_vec.zmm; - - // Let's shift the longer string now. - longer_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, longer_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. 
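- // `_ktestz_mask64_u8` returns 1 when the AND of its arguments is zero, i.e. when none of the lanes - // still participating in the diagonal holds a distance within the bound.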
- __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - } - - // Now let's handle the bottom right triangle. - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - - // Check for equality between string slices. - __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(current_vec.zmm, _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm)), - ones_vec.zmm); - next_vec.zmm = _mm512_min_epu8(gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm); - current_vec.zmm = next_vec.zmm; - - // Let's shift the longer string now. - longer_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, longer_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. - __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - // In every following iteration we use a shorter prefix of each register, - // but we don't need to update the `next_diagonal_mask` anymore... except for the early exit. - next_diagonal_mask = _kshiftri_mask64(next_diagonal_mask, 1); - } - return current_vec.u8s[0]; -} - -/** - * @brief Computes the edit distance between two somewhat short byte-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 127, and evaluates at most (127 * 2 + 1 = 255) diagonals. - * Supports an early exit, if the distance is bounded. - * Uses a lot more CPU register space than the `upto63` variant. - * Benefits from the @b `vpermi2b` instructions, that can rotate the bytes in 2 registers at once. - * - * This may be one of the most frequently called kernels for: - * - source code analysis, assuming most lines are either under 80 or under 120 characters long. - * - DNA sequence alignment, as most short reads are 50-300 characters long. - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto127_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -/** - * @brief Computes the edit distance between two longer byte-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 255, and evaluates at most (255 * 2 + 1 = 511) diagonals. - * Supports an early exit, if the distance is bounded. - * Uses a lot more CPU register space than the `upto63` variant. - * - * Each of the 2x strings ends up occupying 4 ZMM registers, and each of 3x diagonals uses 4 ZMM registers. - * So 20x of the 32x are persistently occupied, and the rest are used for math temporarily. - * This is the largest space-efficient variant, as strings beyond 255 characters may require - * 16-bit accumulators, which would be a significant bottleneck.
- */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -/** - * @brief Computes the edit distance between two longer byte-strings using the AVX-512VBMI extensions, - * assuming the upper distance bound cannot exceed 255, but the string length can be arbitrary. - * - * Applies to string lengths up to 255, and evaluates at most (255 * 2 + 1 = 511) diagonals. - * Supports an early exit, if the distance is bounded. - * Uses a lot more CPU register space than the `upto63` variant. - * - * Each of the 2x strings ends up occupying 4 ZMM registers, and each of 3x diagonals uses 4 ZMM registers. - * So 20x of the 32x are persistently occupied, and the rest are used for math temporarily. - * This is the largest space-efficient variant, as strings beyond 255 characters may require - * 16-bit accumulators, which would be a significant bottleneck. - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto255bound_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -/** - * @brief Computes the edit distance between two mid-length UTF-8-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 127, and evaluates at most (127 * 2 + 1 = 255) diagonals. - * Supports an early exit, if the distance is bounded. - * Benefits from the @b `valignd` instructions used to rotate unpacked UTF-32 Unicode codepoints. - * - * Each string is unpacked into 128 characters * 4 bytes per character / 64 bytes per register = 8 registers. - * - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_utf8_skewed_diagonals_upto127_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { + switch (length) { + case 0: return 0; - sz_unused(shorter && longer && bound && alloc); + // Texts under 7 bytes long are definitely below the largest prime.
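+ // Even with the larger 257 base and the shifted characters, a 7-byte prefix stays far below + // the prime, which sits just under 2^64, so no `_sz_prime_mod` reduction is needed in these cases.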
+ case 1: + hash_low = _sz_shift_low(text[0]); + hash_high = _sz_shift_high(text[0]); + break; + case 2: + hash_low = _sz_shift_low(text[0]) * 31ull + _sz_shift_low(text[1]); + hash_high = _sz_shift_high(text[0]) * 257ull + _sz_shift_high(text[1]); + break; + case 3: + hash_low = _sz_shift_low(text[0]) * 31ull * 31ull + // + _sz_shift_low(text[1]) * 31ull + // + _sz_shift_low(text[2]); + hash_high = _sz_shift_high(text[0]) * 257ull * 257ull + // + _sz_shift_high(text[1]) * 257ull + // + _sz_shift_high(text[2]); + break; + case 4: + hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull + // + _sz_shift_low(text[1]) * 31ull * 31ull + // + _sz_shift_low(text[2]) * 31ull + // + _sz_shift_low(text[3]); + hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull + // + _sz_shift_high(text[1]) * 257ull * 257ull + // + _sz_shift_high(text[2]) * 257ull + // + _sz_shift_high(text[3]); + break; + case 5: + hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull + // + _sz_shift_low(text[1]) * 31ull * 31ull * 31ull + // + _sz_shift_low(text[2]) * 31ull * 31ull + // + _sz_shift_low(text[3]) * 31ull + // + _sz_shift_low(text[4]); + hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull + // + _sz_shift_high(text[1]) * 257ull * 257ull * 257ull + // + _sz_shift_high(text[2]) * 257ull * 257ull + // + _sz_shift_high(text[3]) * 257ull + // + _sz_shift_high(text[4]); + break; + case 6: + hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull + // + _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull + // + _sz_shift_low(text[2]) * 31ull * 31ull * 31ull + // + _sz_shift_low(text[3]) * 31ull * 31ull + // + _sz_shift_low(text[4]) * 31ull + // + _sz_shift_low(text[5]); + hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull + // + _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull + // + _sz_shift_high(text[2]) * 257ull * 257ull * 257ull + // + _sz_shift_high(text[3]) * 257ull * 257ull + // + _sz_shift_high(text[4]) * 257ull + // + _sz_shift_high(text[5]); + break; + case 7: + hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull * 31ull + // + _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull * 31ull + // + _sz_shift_low(text[2]) * 31ull * 31ull * 31ull * 31ull + // + _sz_shift_low(text[3]) * 31ull * 31ull * 31ull + // + _sz_shift_low(text[4]) * 31ull * 31ull + // + _sz_shift_low(text[5]) * 31ull + // + _sz_shift_low(text[6]); + hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull * 257ull + // + _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull * 257ull + // + _sz_shift_high(text[2]) * 257ull * 257ull * 257ull * 257ull + // + _sz_shift_high(text[3]) * 257ull * 257ull * 257ull + // + _sz_shift_high(text[4]) * 257ull * 257ull + // + _sz_shift_high(text[5]) * 257ull + // + _sz_shift_high(text[6]); + break; + default: + // Unroll the first seven cycles: + hash_low = hash_low * 31ull + _sz_shift_low(text[0]); + hash_high = hash_high * 257ull + _sz_shift_high(text[0]); + hash_low = hash_low * 31ull + _sz_shift_low(text[1]); + hash_high = hash_high * 257ull + _sz_shift_high(text[1]); + hash_low = hash_low * 31ull + _sz_shift_low(text[2]); + hash_high = hash_high * 257ull + _sz_shift_high(text[2]); + hash_low = hash_low * 31ull + _sz_shift_low(text[3]); + hash_high = hash_high * 257ull + _sz_shift_high(text[3]); + hash_low = hash_low * 31ull + _sz_shift_low(text[4]); + hash_high = hash_high * 257ull + _sz_shift_high(text[4]); + hash_low = hash_low * 31ull + 
_sz_shift_low(text[5]); + hash_high = hash_high * 257ull + _sz_shift_high(text[5]); + hash_low = hash_low * 31ull + _sz_shift_low(text[6]); + hash_high = hash_high * 257ull + _sz_shift_high(text[6]); + text += 7; - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; + // Iterate throw the rest with the modulus: + for (; text != text_end; ++text) { + hash_low = hash_low * 31ull + _sz_shift_low(text[0]); + hash_high = hash_high * 257ull + _sz_shift_high(text[0]); + // Wrap the hashes around: + hash_low = _sz_prime_mod(hash_low); + hash_high = _sz_prime_mod(hash_high); + } + break; } - // TODO: Generalize! - sz_size_t const max_length = 256u * 256u; - sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); - sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); - sz_unused(longer_length && bound && max_length); - -#if 0 - // We are going to store 3 diagonals of the matrix. - // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. - sz_size_t const shorter_dim = shorter_length + 1; - sz_size_t const longer_dim = longer_length + 1; - // Unlike the serial version, we also want to avoid reverse-order iteration over teh shorter string. - // So let's allocate a bit more memory and reverse-export our shorter string into that buffer. - sz_size_t const buffer_length = sizeof(sz_u16_t) * longer_dim * 3 + shorter_length; - sz_u16_t *const distances = (sz_u16_t *)alloc->allocate(buffer_length, alloc->handle); - if (!distances) return SZ_SIZE_MAX; - - // The next few pointers will be swapped around. - sz_u16_t *previous_distances = distances; - sz_u16_t *current_distances = previous_distances + longer_dim; - sz_u16_t *next_distances = current_distances + longer_dim; - sz_ptr_t const shorter_reversed = (sz_ptr_t)(next_distances + longer_dim); - - // Export the reversed string into the buffer. - for (sz_size_t i = 0; i != shorter_length; ++i) shorter_reversed[i] = shorter[shorter_length - 1 - i]; + return _sz_hash_mix(hash_low, hash_high); +} - // Initialize the first two diagonals: - previous_distances[0] = 0; - current_distances[0] = current_distances[1] = 1; +SZ_PUBLIC void sz_hashes_serial(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // + sz_hash_callback_t callback, void *callback_handle) { - // Using ZMM registers, we can process 32x 16-bit values at once, - // storing 16 bytes of each string in YMM registers. - sz_u512_vec_t insertions_vec, deletions_vec, substitutions_vec, next_vec; - sz_u512_vec_t ones_u16_vec; - ones_u16_vec.zmm = _mm512_set1_epi16(1); + if (length < window_length || !window_length) return; + sz_u8_t const *text = (sz_u8_t const *)start; + sz_u8_t const *text_end = text + length; - // This is a mixed-precision implementation, using 8-bit representations for part of the operations. - // Even there, in case `SZ_USE_X86_AVX2=0`, let's use the `sz_u512_vec_t` type, addressing the first YMM halfs. - sz_u512_vec_t shorter_vec, longer_vec; - sz_u512_vec_t ones_u8_vec; - ones_u8_vec.ymms[0] = _mm256_set1_epi8(1); + // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. 
+ sz_u64_t prime_power_low = 1, prime_power_high = 1; + for (sz_size_t i = 0; i + 1 < window_length; ++i) + prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, + prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - // Let's say we are dealing with 3 and 5 letter words. - // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim). - // It will have: - // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3. - // - 2 diagonals of fixed length, at positions: 4, 5. - // - 3 diagonals of decreasing length, at positions: 6, 7, 8. - sz_size_t const diagonals_count = shorter_dim + longer_dim - 1; + // Compute the initial hash value for the first window. + sz_u64_t hash_low = 0, hash_high = 0, hash_mix; + for (sz_u8_t const *first_end = text + window_length; text < first_end; ++text) + hash_low = (hash_low * 31ull + _sz_shift_low(*text)) % SZ_U64_MAX_PRIME, + hash_high = (hash_high * 257ull + _sz_shift_high(*text)) % SZ_U64_MAX_PRIME; - // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_diagonal_index = 2; - for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = next_diagonal_index + 1; - for (sz_size_t offset_within_diagonal = 0; offset_within_diagonal + 2 < next_diagonal_length;) { - sz_u32_t remaining_length = (sz_u32_t)(next_diagonal_length - offset_within_diagonal - 2); - sz_u32_t register_length = remaining_length < 32 ? remaining_length : 32; - sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length); - longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + offset_within_diagonal); - // Our original code addressed the shorter string `[next_diagonal_index - offset_within_diagonal - 2]` - // for growing `offset_within_diagonal`. If the `shorter` string was reversed, the - // `[next_diagonal_index - offset_within_diagonal - 2]` would be equal to `[shorter_length - 1 - - // next_diagonal_index + offset_within_diagonal + 2]`. Which simplified would be equal to - // `[shorter_length - next_diagonal_index + offset_within_diagonal + 1]`. - shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8( // - remaining_length_mask, - shorter_reversed + shorter_length - next_diagonal_index + offset_within_diagonal + 1); - // For substitutions, perform the equality comparison using AVX2 instead of AVX-512 - // to get the result as a vector, instead of a bitmask. Adding 1 to every scalar we can overflow - // transforming from {0xFF, 0} values to {0, 1} values - exactly what we need. Then - upcast to 16-bit. - substitutions_vec.zmm = _mm512_cvtepi8_epi16( // - _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0])); - substitutions_vec.zmm = _mm512_add_epi16( // - substitutions_vec.zmm, - _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + offset_within_diagonal)); - // For insertions and deletions, on modern hardware, it's faster to issue two separate loads, - // than rotate the bytes in the ZMM register. - insertions_vec.zmm = - _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + offset_within_diagonal); - deletions_vec.zmm = - _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + offset_within_diagonal + 1); - // First get the minimum of insertions and deletions. 
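/* The deleted skewed-diagonal kernels around this hunk all share one dataflow: walk the
 * Levenshtein matrix one anti-diagonal at a time, keeping only three diagonals alive, so each
 * step depends only on the previous two. A minimal scalar sketch of that traversal with plain
 * `size_t` cells instead of packed 8- or 16-bit lanes (a reference for the recurrence, not the
 * library's implementation). */
#include <stddef.h>
#include <stdlib.h>

static size_t levenshtein_antidiagonal(char const *a, size_t m, char const *b, size_t n) {
    if (m == 0) return n;
    if (n == 0) return m;
    size_t *buffer = (size_t *)malloc((m + 1) * 3 * sizeof(size_t)); // Three diagonals, indexed by `i` in `a`.
    if (!buffer) return (size_t)-1;                                  // Mirrors the SZ_SIZE_MAX failure convention.
    size_t *prev = buffer, *cur = prev + (m + 1), *next = cur + (m + 1);
    prev[0] = 0;            // Diagonal 0 holds only D(0,0).
    cur[0] = 1, cur[1] = 1; // Diagonal 1 holds D(0,1) and D(1,0).
    for (size_t d = 2; d <= m + n; ++d) {
        size_t i_min = d > n ? d - n : 0, i_max = d < m ? d : m;
        for (size_t i = i_min; i <= i_max; ++i) {
            size_t j = d - i;
            if (i == 0) { next[i] = j; continue; } // First row of the matrix.
            if (j == 0) { next[i] = i; continue; } // First column of the matrix.
            size_t deletion = cur[i - 1] + 1;      // D(i-1, j) lives on diagonal d-1.
            size_t insertion = cur[i] + 1;         // D(i, j-1) lives on diagonal d-1.
            size_t substitution = prev[i - 1] + (a[i - 1] != b[j - 1]);
            size_t best = deletion < insertion ? deletion : insertion;
            next[i] = best < substitution ? best : substitution;
        }
        size_t *temporary = prev; // Rotate the three buffers, as the SIMD kernels do with registers.
        prev = cur, cur = next, next = temporary;
    }
    size_t result = cur[m]; // The last diagonal (d = m + n) holds only D(m, n).
    free(buffer);
    return result;
}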
- next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm); - next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm); - _mm512_mask_storeu_epi16(next_distances + offset_within_diagonal + 1, remaining_length_mask, next_vec.zmm); - offset_within_diagonal += register_length; - } - // Don't forget to populate the first row and the first column of the Levenshtein matrix. - next_distances[0] = next_distances[next_diagonal_length - 1] = (sz_u16_t)next_diagonal_index; - // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory. - sz_u16_t *temporary = previous_distances; - previous_distances = current_distances; - current_distances = next_distances; - next_distances = temporary; - } + // In most cases the fingerprint length will be a power of two. + hash_mix = _sz_hash_mix(hash_low, hash_high); + callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a - // larger diagonal. From now onwards, we will be shrinking. Instead of adding value equal to the skewed diagonal - // index on either side, we will be cropping those values out. - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; - for (sz_size_t i = 0; i != next_diagonal_length;) { - sz_u32_t remaining_length = (sz_u32_t)(next_diagonal_length - i); - sz_u32_t register_length = remaining_length < 32 ? remaining_length : 32; - sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length); - longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + next_diagonal_index - n + i); - // Our original code addressed the shorter string `[shorter_length - 1 - i]` for growing `i`. - // If the `shorter` string was reversed, the `[shorter_length - 1 - i]` would - // be equal to `[shorter_length - 1 - shorter_length + 1 + i]`. - // Which simplified would be equal to just `[i]`. Beautiful! - shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, shorter_reversed + i); - // For substitutions, perform the equality comparison using AVX2 instead of AVX-512 - // to get the result as a vector, instead of a bitmask. The compare it against the accumulated - // substitution costs. - substitutions_vec.zmm = _mm512_cvtepi8_epi16( // - _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0])); - substitutions_vec.zmm = _mm512_add_epi16( // - substitutions_vec.zmm, _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + i)); - // For insertions and deletions, on modern hardware, it's faster to issue two separate loads, - // than rotate the bytes in the ZMM register. - insertions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i); - deletions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i + 1); - // First get the minimum of insertions and deletions. - next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm); - next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm); - _mm512_mask_storeu_epi16(next_distances + i, remaining_length_mask, next_vec.zmm); - i += register_length; + // Compute the hash value for every window, exporting into the fingerprint, + // using the expensive modulo operation. 
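/* A scalar sketch of the window update performed by the loop that follows (and, four windows at
 * a time, by the AVX2 variant further down): the outgoing byte is removed with the precomputed
 * `base ^ (window_length - 1) mod prime` weight, the incoming byte is appended, and a hash is
 * reported once per `step` windows. A deliberately small stand-in prime is used so every product
 * fits in 64 bits; only the base-257 hash is shown, the base-31 one follows the same pattern.
 * The `cycle & (step - 1)` test matches the source and assumes `step` is a power of two. */
#include <assert.h>
#include <stdint.h>
#include <stddef.h>

#define DEMO_BASE 257ull
#define DEMO_PRIME 4294967291ull // Largest 32-bit prime, a stand-in for the library's 64-bit one.

static uint64_t demo_window_hash(uint8_t const *window, size_t window_length) {
    uint64_t hash = 0;
    for (size_t i = 0; i != window_length; ++i) hash = (hash * DEMO_BASE + window[i]) % DEMO_PRIME;
    return hash;
}

static void demo_rolling_hashes(uint8_t const *text, size_t length, size_t window_length, size_t step) {
    assert(step && (step & (step - 1)) == 0 && "The `& (step - 1)` trick assumes a power-of-two step.");
    uint64_t power = 1; // Weight of the outgoing byte: base ^ (window_length - 1) mod prime.
    for (size_t i = 0; i + 1 < window_length; ++i) power = power * DEMO_BASE % DEMO_PRIME;
    uint64_t hash = demo_window_hash(text, window_length);
    for (size_t i = window_length; i < length; ++i) {
        uint64_t removed = text[i - window_length] * power % DEMO_PRIME;
        hash = (hash + DEMO_PRIME - removed) % DEMO_PRIME; // Drop the oldest byte.
        hash = (hash * DEMO_BASE + text[i]) % DEMO_PRIME;  // Append the newest byte.
        assert(hash == demo_window_hash(text + i - window_length + 1, window_length));
        size_t cycle = i - window_length + 1;
        if ((cycle & (step - 1)) == 0) { /* export `hash` into the fingerprint here */ }
    }
}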
+ sz_size_t cycles = 1; + sz_size_t const step_mask = step - 1; + for (; text < text_end; ++text, ++cycles) { + // Discard one character: + hash_low -= _sz_shift_low(*(text - window_length)) * prime_power_low; + hash_high -= _sz_shift_high(*(text - window_length)) * prime_power_high; + // And add a new one: + hash_low = 31ull * hash_low + _sz_shift_low(*text); + hash_high = 257ull * hash_high + _sz_shift_high(*text); + // Wrap the hashes around: + hash_low = _sz_prime_mod(hash_low); + hash_high = _sz_prime_mod(hash_high); + // Mix only if we've skipped enough hashes. + if ((cycles & step_mask) == 0) { + hash_mix = _sz_hash_mix(hash_low, hash_high); + callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); } - - // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory, this time, with a shift, - // dropping the first element in the current array. - sz_u16_t *temporary = previous_distances; - previous_distances = current_distances + 1; - current_distances = next_distances; - next_distances = temporary; } - - // Cache scalar before `free` call. - sz_size_t result = current_distances[0]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -#endif - return 0; } -SZ_INTERNAL sz_size_t sz_edit_distance_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Bounded computations may exit early. - int const is_bounded = bound < longer_length; - if (is_bounded) { - // If one of the strings is empty - the edit distance is equal to the length of the other one. - if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); - // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; - } +#undef _sz_shift_low +#undef _sz_shift_high +#undef _sz_hash_mix +#undef _sz_prime_mod - // Make sure the shorter string is actually shorter. - if (shorter_length > longer_length) { - sz_cptr_t temporary = shorter; - shorter = longer; - longer = temporary; - sz_size_t temporary_length = shorter_length; - shorter_length = longer_length; - longer_length = temporary_length; - } +#pragma endregion // Serial Implementation - // Dispatch the right implementation based on the length of the strings. - if (longer_length < 64u) - return _sz_edit_distance_skewed_diagonals_upto63_avx512( // - shorter, shorter_length, longer, longer_length, bound); - // else if (longer_length < 256u * 256u) - // return _sz_edit_distance_skewed_diagonals_upto65k_avx512( // - // shorter, shorter_length, longer, longer_length, bound, alloc); - else - return sz_edit_distance_serial(shorter, shorter_length, longer, longer_length, bound, alloc); -} +/* AVX2 implementation of the string search algorithms for Haswell processors and newer. + * Very minimalistic (compared to AVX-512), but still faster than the serial implementation. + */ +#pragma region Haswell Implementation +#if SZ_USE_HASWELL +#pragma GCC push_options +#pragma GCC target("haswell") +#pragma clang attribute push(__attribute__((target("haswell"))), apply_to = function) -SZ_PUBLIC sz_u64_t sz_checksum_avx512(sz_cptr_t text, sz_size_t length) { +SZ_PUBLIC sz_u64_t sz_checksum_avx2(sz_cptr_t text, sz_size_t length) { // The naive implementation of this function is very simple. // It assumes the CPU is great at handling unaligned "loads". 
// - // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, - // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. - // With two strings, we may consider the overal workload huge, if each exceeds 1 MB in length. - int const is_huge = length >= 1ull * 1024ull * 1024ull; - sz_u512_vec_t text_vec, sums_vec; + // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core, + // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. + // For now, let's avoid the cases beyond the L2 size. + int is_huge = length > 1ull * 1024ull * 1024ull; // When the buffer is small, there isn't much to innovate. - if (length <= 16) { - __mmask16 mask = _sz_u16_mask_until(length); - text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text); - sums_vec.xmms[0] = _mm_sad_epu8(text_vec.xmms[0], _mm_setzero_si128()); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_vec.xmms[0]); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_vec.xmms[0], 1); - return low + high; - } - else if (length <= 32) { - __mmask32 mask = _sz_u32_mask_until(length); - text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text); - sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256()); + if (length <= 32) { return sz_checksum_serial(text, length); } + else if (!is_huge) { + sz_u256_vec_t text_vec, sums_vec; + sums_vec.ymm = _mm256_setzero_si256(); + for (; length >= 32; text += 32, length -= 32) { + text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); + sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); + } // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1); + __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); + __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - return low + high; - } - else if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - text_vec.zmm = _mm512_maskz_loadu_epi8(mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - return _mm512_reduce_add_epi64(sums_vec.zmm); - } - else if (!is_huge) { - sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(text + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. 
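/* Both the AVX-512 code being removed here and its AVX2 replacement reduce a byte checksum to
 * `vpsadbw` against a zero vector: the sum of absolute differences between 8 unsigned bytes and
 * 8 zeros is just the sum of those bytes, delivered in each 64-bit lane. A small self-checking
 * sketch of that primitive (standalone, compile with AVX2 enabled, e.g. -mavx2): */
#include <assert.h>
#include <immintrin.h>
#include <stdint.h>

static uint64_t sum_32_bytes_avx2(uint8_t const *data) {
    __m256i text = _mm256_loadu_si256((__m256i const *)data);
    __m256i sums = _mm256_sad_epu8(text, _mm256_setzero_si256()); // Four 64-bit lanes, each the sum of 8 bytes.
    __m128i both = _mm_add_epi64(_mm256_castsi256_si128(sums), _mm256_extracti128_si256(sums, 1));
    return (uint64_t)_mm_cvtsi128_si64(both) + (uint64_t)_mm_extract_epi64(both, 1);
}

int main(void) {
    uint8_t data[32];
    uint64_t expected = 0;
    for (int i = 0; i != 32; ++i) data[i] = (uint8_t)(i * 7 + 3), expected += data[i];
    assert(sum_32_bytes_avx2(data) == expected);
    return 0;
}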
- __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - for (text += head_length; body_length >= 64; text += 64, body_length -= 64) { - text_vec.zmm = _mm512_load_si512((__m512i const *)text); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - } - text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - return _mm512_reduce_add_epi64(sums_vec.zmm); + sz_u64_t result = low + high; + if (length) result += sz_checksum_serial(text, length); + return result; } // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // - // 1. Moving in both directions to maximize the throughput, when fetching from multiple - // memory pages. Also helps with cache set-associativity issues, as we won't always - // be fetching the same entries in the lookup table. - // 2. Using non-temporal stores to avoid polluting the cache. - // 3. Prefetching the next cache line, to avoid stalling the CPU. This generally useless - // for predictable patterns, so disregard this advice. - // - // Bidirectional traversal generally adds about 10% to such algorithms. + // Most notably, we can avoid populating the cache with the entire buffer, and instead traverse it in 2 directions. else { - sz_u512_vec_t text_reversed_vec, sums_reversed_vec; - sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; - sz_size_t tail_length = (sz_size_t)(text + length) % 64; - sz_size_t body_length = length - head_length - tail_length; - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); + sz_size_t head_length = (32 - ((sz_size_t)text % 32)) % 32; // 31 or less. + sz_size_t tail_length = (sz_size_t)(text + length) % 32; // 31 or less. + sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. + sz_u64_t result = 0; - text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length); - sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()); + // Handle the head + while (head_length--) result += *text++; - // Now in the main loop, we can use non-temporal loads and stores, - // performing the operation in both directions. - for (text += head_length; body_length >= 128; text += 64, text += 64, body_length -= 128) { - text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text)); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - text_reversed_vec.zmm = _mm512_stream_load_si512((__m512i *)(text + body_length - 64)); - sums_reversed_vec.zmm = - _mm512_add_epi64(sums_reversed_vec.zmm, _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512())); + sz_u256_vec_t text_vec, sums_vec; + sums_vec.ymm = _mm256_setzero_si256(); + // Fill the aligned body of the buffer. 
+ if (!is_huge) { + for (; body_length >= 32; text += 32, body_length -= 32) { + text_vec.ymm = _mm256_stream_load_si256((__m256i const *)text); + sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); + } } - if (body_length >= 64) { - text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text)); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); + // When the biffer is huge, we can traverse it in 2 directions. + else { + sz_u256_vec_t text_reversed_vec, sums_reversed_vec; + sums_reversed_vec.ymm = _mm256_setzero_si256(); + for (; body_length >= 64; text += 64, body_length -= 64) { + text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); + sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); + text_reversed_vec.ymm = _mm256_stream_load_si256((__m256i *)(text + body_length - 64)); + sums_reversed_vec.ymm = _mm256_add_epi64( + sums_reversed_vec.ymm, _mm256_sad_epu8(text_reversed_vec.ymm, _mm256_setzero_si256())); + } + if (body_length >= 32) { + text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); + sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); + } + sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, sums_reversed_vec.ymm); } - return _mm512_reduce_add_epi64(_mm512_add_epi64(sums_vec.zmm, sums_reversed_vec.zmm)); + // Handle the tail + while (tail_length--) result += *text++; + + // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. + __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); + __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); + __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); + sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); + sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); + result += low + high; + return result; } } -SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { +/** + * @brief There is no AVX2 instruction for fast multiplication of 64-bit integers. + * This implementation is coming from Agner Fog's Vector Class Library. + */ +SZ_INTERNAL __m256i _mm256_mul_epu64(__m256i a, __m256i b) { + __m256i bswap = _mm256_shuffle_epi32(b, 0xB1); + __m256i prodlh = _mm256_mullo_epi32(a, bswap); + __m256i zero = _mm256_setzero_si256(); + __m256i prodlh2 = _mm256_hadd_epi32(prodlh, zero); + __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2, 0x73); + __m256i prodll = _mm256_mul_epu32(a, b); + __m256i prod = _mm256_add_epi64(prodll, prodlh3); + return prod; +} + +SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // + sz_hash_callback_t callback, void *callback_handle) { if (length < window_length || !window_length) return; if (length < 4 * window_length) { @@ -5696,57 +461,58 @@ SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t win sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3; sz_u8_t const *text_end = text_first + length; - // Broadcast the global constants into the registers. - // Both high and low hashes will work with the same prime and golden ratio. 
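/* The `_mm256_mul_epu64` helper introduced above builds the low 64 bits of a 64 x 64 multiply
 * out of 32-bit pieces, since AVX2 has no 64-bit integer multiply. A standalone cross-check of
 * that trick against scalar multiplication; the helper is restated here only so the snippet
 * compiles on its own (requires AVX2): */
#include <assert.h>
#include <immintrin.h>
#include <stdint.h>

static __m256i mul_epu64_avx2(__m256i a, __m256i b) {
    __m256i bswap = _mm256_shuffle_epi32(b, 0xB1);                       // Swap the 32-bit halves of every lane.
    __m256i prodlh = _mm256_mullo_epi32(a, bswap);                       // aL*bH and aH*bL as 32-bit products.
    __m256i prodlh2 = _mm256_hadd_epi32(prodlh, _mm256_setzero_si256()); // aL*bH + aH*bL per 64-bit lane.
    __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2, 0x73);               // Move those sums into the high halves.
    __m256i prodll = _mm256_mul_epu32(a, b);                             // aL*bL as full 64-bit products.
    return _mm256_add_epi64(prodll, prodlh3);
}

int main(void) {
    uint64_t lhs[4] = {31ull, 257ull, 0x0123456789ABCDEFull, 18446744073709551557ull};
    uint64_t rhs[4] = {11400714819323198485ull, 3ull, 0x0FEDCBA987654321ull, 2ull};
    uint64_t out[4];
    __m256i product = mul_epu64_avx2(_mm256_loadu_si256((__m256i const *)lhs), //
                                     _mm256_loadu_si256((__m256i const *)rhs));
    _mm256_storeu_si256((__m256i *)out, product);
    for (int i = 0; i != 4; ++i) assert(out[i] == lhs[i] * rhs[i]); // Both sides wrap modulo 2^64.
    return 0;
}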
- sz_u512_vec_t prime_vec, golden_ratio_vec; - prime_vec.zmm = _mm512_set1_epi64(SZ_U64_MAX_PRIME); - golden_ratio_vec.zmm = _mm512_set1_epi64(11400714819323198485ull); - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. sz_u64_t prime_power_low = 1, prime_power_high = 1; for (sz_size_t i = 0; i + 1 < window_length; ++i) prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - // We will be evaluating 4 offsets at a time with 2 different hash functions. - // We can fit all those 8 state variables in each of the following ZMM registers. - sz_u512_vec_t base_vec, prime_power_vec, shift_vec; - base_vec.zmm = _mm512_set_epi64(31ull, 31ull, 31ull, 31ull, 257ull, 257ull, 257ull, 257ull); - shift_vec.zmm = _mm512_set_epi64(0ull, 0ull, 0ull, 0ull, 77ull, 77ull, 77ull, 77ull); - prime_power_vec.zmm = _mm512_set_epi64(prime_power_low, prime_power_low, prime_power_low, prime_power_low, - prime_power_high, prime_power_high, prime_power_high, prime_power_high); + // Broadcast the constants into the registers. + sz_u256_vec_t prime_vec, golden_ratio_vec; + sz_u256_vec_t base_low_vec, base_high_vec, prime_power_low_vec, prime_power_high_vec, shift_high_vec; + base_low_vec.ymm = _mm256_set1_epi64x(31ull); + base_high_vec.ymm = _mm256_set1_epi64x(257ull); + shift_high_vec.ymm = _mm256_set1_epi64x(77ull); + prime_vec.ymm = _mm256_set1_epi64x(SZ_U64_MAX_PRIME); + golden_ratio_vec.ymm = _mm256_set1_epi64x(11400714819323198485ull); + prime_power_low_vec.ymm = _mm256_set1_epi64x(prime_power_low); + prime_power_high_vec.ymm = _mm256_set1_epi64x(prime_power_high); // Compute the initial hash values for every one of the four windows. - sz_u512_vec_t hash_vec, chars_vec; - hash_vec.zmm = _mm512_setzero_si512(); + sz_u256_vec_t hash_low_vec, hash_high_vec, hash_mix_vec, chars_low_vec, chars_high_vec; + hash_low_vec.ymm = _mm256_setzero_si256(); + hash_high_vec.ymm = _mm256_setzero_si256(); for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; ++text_first, ++text_second, ++text_third, ++text_fourth) { // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); + hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); + hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`... - chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); + // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. + chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); + chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); + hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); + hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); // 4. Compute the modulo. Assuming there are only 59 values between our prime // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. 
- hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); + hash_low_vec.ymm = _mm256_blendv_epi8( // + hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), + _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); + hash_high_vec.ymm = _mm256_blendv_epi8( // + hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), + _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); } // 5. Compute the hash mix, that will be used to index into the fingerprint. // This includes a serial step at the end. - sz_u512_vec_t hash_mix_vec; - hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_extracti64x4_epi64(hash_mix_vec.zmm, 0)); - + hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); + hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); + hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); @@ -5754,45 +520,45 @@ SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t win // Now repeat that operation for the remaining characters, discarding older characters. sz_size_t cycle = 1; - sz_size_t step_mask = step - 1; + sz_size_t const step_mask = step - 1; for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { // 0. Load again the four characters we are dropping, shift them, and subtract. - chars_vec.zmm = _mm512_set_epi64(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length], // - text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - hash_vec.zmm = _mm512_sub_epi64(hash_vec.zmm, _mm512_mullo_epi64(chars_vec.zmm, prime_power_vec.zmm)); + chars_low_vec.ymm = _mm256_set_epi64x( // + text_fourth[-window_length], text_third[-window_length], text_second[-window_length], + text_first[-window_length]); + chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); + hash_low_vec.ymm = + _mm256_sub_epi64(hash_low_vec.ymm, _mm256_mul_epu64(chars_low_vec.ymm, prime_power_low_vec.ymm)); + hash_high_vec.ymm = + _mm256_sub_epi64(hash_high_vec.ymm, _mm256_mul_epu64(chars_high_vec.ymm, prime_power_high_vec.ymm)); // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); + hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); + hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - - // ... and prefetch the next four characters into Level 2 or higher. 
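/* The "conditionally subtract the prime" step above works because the prime sits only 59 below
 * 2^64: every 64-bit value is therefore smaller than twice the prime, so a single compare-and-
 * subtract lands it in [0, prime). The preceding multiply has of course already wrapped modulo
 * 2^64, which is why this is a fingerprint-grade reduction rather than exact modular arithmetic.
 * A scalar sketch with an explicit unsigned comparison (the prime value is assumed from the
 * comment above): */
#include <assert.h>
#include <stdint.h>

#define DEMO_U64_PRIME 18446744073709551557ull // 2^64 - 59.

static uint64_t reduce_once(uint64_t value) { return value >= DEMO_U64_PRIME ? value - DEMO_U64_PRIME : value; }

int main(void) {
    assert(reduce_once(0) == 0);
    assert(reduce_once(DEMO_U64_PRIME - 1) == DEMO_U64_PRIME - 1);
    assert(reduce_once(DEMO_U64_PRIME) == 0);
    assert(reduce_once(UINT64_MAX) == 58); // The largest remainder a single subtraction can leave.
    return 0;
}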
- _mm_prefetch((sz_cptr_t)text_fourth + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_third + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_second + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_first + 1, _MM_HINT_T1); + chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); + chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); + hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); + hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); // 4. Compute the modulo. Assuming there are only 59 values between our prime // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); + hash_low_vec.ymm = _mm256_blendv_epi8( // + hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), + _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); + hash_high_vec.ymm = _mm256_blendv_epi8( // + hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), + _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); // 5. Compute the hash mix, that will be used to index into the fingerprint. // This includes a serial step at the end. - hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_castsi512_si256(hash_mix_vec.zmm)); - + hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); + hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); + hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); if ((cycle & step_mask) == 0) { callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); @@ -5804,1353 +570,346 @@ SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t win #pragma clang attribute pop #pragma GCC pop_options +#endif // SZ_USE_HASWELL +#pragma endregion // Haswell Implementation -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512vbmi", "avx512vbmi2", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512vbmi,avx512vbmi2,bmi,bmi2"))), \ - apply_to = function) - -SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - // But if at least 3 cache lines are touched, the AVX-512 implementation should be faster. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } - - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. 
- sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - // We need to pull the lookup table into 4x ZMM registers. - // We can use `vpermi2b` instruction to perform the look in two ZMM registers with `_mm512_permutex2var_epi8` - // intrinsics, but it has a 6-cycle latency on Sapphire Rapids and requires AVX512-VBMI. Assuming we need to - // operate on 4 registers, it might be cleaner to use 2x separate `_mm512_permutexvar_epi8` calls. - // Combining the results with 2x `_mm512_test_epi8_mask` and 3x blends afterwards. - // - // - 4x `_mm512_permutexvar_epi8` maps to "VPERMB (ZMM, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p5 - // - On Genoa: 6 cycles latency, ports: 1*FP12 - // - 3x `_mm512_mask_blend_epi8` maps to "VPBLENDMB_Z (ZMM, K, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p05 - // - On Genoa: 1 cycle latency, ports: 1*FP0123 - // - 2x `_mm512_test_epi8_mask` maps to "VPTESTMB (K, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p5 - // - On Genoa: 4 cycles latency, ports: 1*FP01 - // - sz_u512_vec_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec; - lut_0_to_63_vec.zmm = _mm512_loadu_si512((lut)); - lut_64_to_127_vec.zmm = _mm512_loadu_si512((lut + 64)); - lut_128_to_191_vec.zmm = _mm512_loadu_si512((lut + 128)); - lut_192_to_255_vec.zmm = _mm512_loadu_si512((lut + 192)); - - sz_u512_vec_t first_bit_vec, second_bit_vec; - first_bit_vec.zmm = _mm512_set1_epi8((char)0x80); - second_bit_vec.zmm = _mm512_set1_epi8((char)0x40); - - __mmask64 first_bit_mask, second_bit_mask; - sz_u512_vec_t source_vec; - // If the top bit is set in each word of `source_vec`, than we use `lookup_128_to_191_vec` or - // `lookup_192_to_255_vec`. If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`. - sz_u512_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec; - sz_u512_vec_t blended_0_to_127_vec, blended_128_to_255_vec, blended_0_to_255_vec; - - // Handling the head. - if (head_length) { - source_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_mask_storeu_epi8(target, head_mask, blended_0_to_255_vec.zmm); - source += head_length, target += head_length, length -= head_length; - } - - // Handling the body in 64-byte chunks aligned to cache-line boundaries with respect to `target`. 
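/* The kernel above is, semantically, just `target[i] = lut[source[i]]`. The 256-byte table is
 * held in four 64-byte registers, and bits 7 and 6 of each input byte pick which register's
 * permutation result survives the blends. A scalar sketch of exactly that selection (reference
 * semantics only, standalone): */
#include <assert.h>
#include <stdint.h>
#include <stddef.h>

static void look_up_transform_scalar(uint8_t const *source, size_t length, uint8_t const *lut, uint8_t *target) {
    for (size_t i = 0; i != length; ++i) {
        uint8_t byte = source[i];
        uint8_t const *chunk = lut + (byte >> 6) * 64; // Top two bits choose one of four 64-entry chunks.
        target[i] = chunk[byte & 63];                  // Low six bits index within the chunk: same as lut[byte].
    }
}

int main(void) {
    uint8_t lut[256], in[5] = {0, 63, 64, 128, 255}, out[5];
    for (size_t i = 0; i != 256; ++i) lut[i] = (uint8_t)(255 - i); // Any table will do for the check.
    look_up_transform_scalar(in, 5, lut, out);
    for (size_t i = 0; i != 5; ++i) assert(out[i] == lut[in[i]]);
    return 0;
}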
- while (length >= 64) { - source_vec.zmm = _mm512_loadu_si512(source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_store_si512(target, blended_0_to_255_vec.zmm); //! Aligned store, our main weapon! - source += 64, target += 64, length -= 64; - } - - // Handling the tail. - if (tail_length) { - source_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_mask_storeu_epi8(target, tail_mask, blended_0_to_255_vec.zmm); - source += tail_length, target += tail_length, length -= tail_length; - } -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - - // Before initializing the AVX-512 vectors, we may want to run the sequential code for the first few bytes. - // In practice, that only hurts, even when we have matches every 5-ish bytes. - // - // if (length < SZ_SWAR_THRESHOLD) return sz_find_charset_serial(text, length, filter); - // sz_cptr_t early_result = sz_find_charset_serial(text, SZ_SWAR_THRESHOLD, filter); - // if (early_result) return early_result; - // text += SZ_SWAR_THRESHOLD; - // length -= SZ_SWAR_THRESHOLD; - // - // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. - // That way when we invoke `_mm512_shuffle_epi8` we can use the same mask for both lanes. - sz_u512_vec_t filter_even_vec, filter_odd_vec; - __m256i filter_ymm = _mm256_lddqu_si256((__m256i const *)filter); - // There are a few way to initialize filters without having native strided loads. 
- // In the cronological order of experiments: - // - serial code initializing 128 bytes of odd and even mask - // - using several shuffles - // - using `_mm512_permutexvar_epi8` - // - using `_mm512_broadcast_i32x4(_mm256_castsi256_si128(_mm256_maskz_compress_epi8(0x55555555, filter_ymm)))` - // and `_mm512_broadcast_i32x4(_mm256_castsi256_si128(_mm256_maskz_compress_epi8(0xaaaaaaaa, filter_ymm)))` - filter_even_vec.zmm = _mm512_broadcast_i32x4(_mm256_castsi256_si128( // broadcast __m128i to __m512i - _mm256_maskz_compress_epi8(0x55555555, filter_ymm))); - filter_odd_vec.zmm = _mm512_broadcast_i32x4(_mm256_castsi256_si128( // broadcast __m128i to __m512i - _mm256_maskz_compress_epi8(0xaaaaaaaa, filter_ymm))); - // After the unzipping operation, we can validate the contents of the vectors like this: - // - // for (sz_size_t i = 0; i != 16; ++i) { - // sz_assert(filter_even_vec.u8s[i] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 16] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 16] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 32] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 32] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 48] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 48] == filter->_u8s[i * 2 + 1]); - // } - // - sz_u512_vec_t text_vec; - sz_u512_vec_t lower_nibbles_vec, higher_nibbles_vec; - sz_u512_vec_t bitset_even_vec, bitset_odd_vec; - sz_u512_vec_t bitmask_vec, bitmask_lookup_vec; - bitmask_lookup_vec.zmm = _mm512_set_epi8( // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); - - while (length) { - // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set" - // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so - // StrinZilla uses a somewhat different approach. - // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new - // - // sz_u8_t input = *(sz_u8_t const *)text; - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = filter_even_vec.u8s[hi_nibble]; - // sz_u8_t bitset_odd = filter_odd_vec.u8s[hi_nibble]; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_u8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd; - // if ((bitset & bitmask) != 0) return text; - // else { length--, text++; } - // - // The nice part about this, loading the strided data is vey easy with Arm NEON, - // while with x86 CPUs after AVX, shuffles within 256 bits shouldn't be an issue either. 
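/* A runnable version of the scalar reference sketched in the comments above. It assumes the
 * 256-bit set stores membership of byte `c` in bit `c & 7` of byte `c >> 3`, which is the layout
 * the even/odd unzipping of the filter relies on; the type and helper names are hypothetical,
 * not the library's API. */
#include <assert.h>
#include <stdint.h>
#include <string.h>

typedef struct { uint8_t bytes[32]; } demo_charset_t; // 256 bits, one per byte value.

static void demo_charset_add(demo_charset_t *set, uint8_t c) { set->bytes[c >> 3] |= (uint8_t)(1u << (c & 7)); }

static int demo_charset_contains(demo_charset_t const *set, uint8_t c) {
    uint8_t lo_nibble = (uint8_t)(c & 0x0f), hi_nibble = (uint8_t)(c >> 4);
    uint8_t bitset_even = set->bytes[hi_nibble * 2];    // Covers low-nibble values 0..7.
    uint8_t bitset_odd = set->bytes[hi_nibble * 2 + 1]; // Covers low-nibble values 8..15.
    uint8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd;
    return (bitset & (uint8_t)(1u << (lo_nibble & 7))) != 0;
}

int main(void) {
    demo_charset_t set;
    memset(&set, 0, sizeof(set));
    demo_charset_add(&set, 'a'), demo_charset_add(&set, 'Z'), demo_charset_add(&set, 0xF7);
    assert(demo_charset_contains(&set, 'a') && demo_charset_contains(&set, 'Z') && demo_charset_contains(&set, 0xF7));
    assert(!demo_charset_contains(&set, 'b') && !demo_charset_contains(&set, 0));
    return 0;
}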
- sz_size_t load_length = sz_min_of_two(length, 64); - __mmask64 load_mask = _sz_u64_mask_until(load_length); - text_vec.zmm = _mm512_maskz_loadu_epi8(load_mask, text); - lower_nibbles_vec.zmm = _mm512_and_si512(text_vec.zmm, _mm512_set1_epi8(0x0f)); - bitmask_vec.zmm = _mm512_shuffle_epi8(bitmask_lookup_vec.zmm, lower_nibbles_vec.zmm); - // - // At this point we can validate the `bitmask_vec` contents like this: - // - // for (sz_size_t i = 0; i != load_length; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); - // } - // - // Shift right every byte by 4 bits. - // There is no `_mm512_srli_epi8` intrinsic, so we have to use `_mm512_srli_epi16` - // and combine it with a mask to clear the higher bits. - higher_nibbles_vec.zmm = _mm512_and_si512(_mm512_srli_epi16(text_vec.zmm, 4), _mm512_set1_epi8(0x0f)); - bitset_even_vec.zmm = _mm512_shuffle_epi8(filter_even_vec.zmm, higher_nibbles_vec.zmm); - bitset_odd_vec.zmm = _mm512_shuffle_epi8(filter_odd_vec.zmm, higher_nibbles_vec.zmm); - // - // At this point we can validate the `bitset_even_vec` and `bitset_odd_vec` contents like this: - // - // for (sz_size_t i = 0; i != load_length; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t const *bitset_ptr = &filter->_u8s[0]; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; - // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); - // } - // - // TODO: Is this a good place for ternary logic? - __mmask64 take_first = _mm512_cmplt_epi8_mask(lower_nibbles_vec.zmm, _mm512_set1_epi8(8)); - bitset_even_vec.zmm = _mm512_mask_blend_epi8(take_first, bitset_odd_vec.zmm, bitset_even_vec.zmm); - __mmask64 matches_mask = _mm512_mask_test_epi8_mask(load_mask, bitset_even_vec.zmm, bitmask_vec.zmm); - if (matches_mask) { - int offset = sz_u64_ctz(matches_mask); - return text + offset; - } - else { text += load_length, length -= load_length; } - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); -} - -SZ_PUBLIC sz_cptr_t sz_find_many_avx512( // - sz_cptr_t haystack, sz_size_t haystack_length, // - sz_cptr_t const *needles, sz_size_t const *needles_lengths, // - sz_size_t *needle_offset) { - - // When dealing with huge needles vocabularies, like in tokenization workloads, we need to construct an automaton. - // But in many cases, the vocabulary is small enough to use a simpler DFA-less approach, combining the ideas from - // the `sz_find_avx512` and `sz_find_charset_avx512` functions. - // - // Pick the offsets within needles where there is the least variance in the characters. - // Like for "the", "then", "there", "these", "those", "their", "they", "them", "that", "this", "thus", "than": - // - // 0: 't' - // 1: 'h' - // 2: 'e', 'a', 'i', 'o', 'u' - // 3: 'n', 'r', 's', 'i', 'y', 'm', 't' - // - // So depending on our "register budget", we can use a different number of pivot points: offset 0, 1, 2 make - // the most sense if we can only use 3 ZMM registers. - sz_unused(haystack && haystack_length && needles && needles_lengths && needle_offset); - return 0; -} - -/** - * Computes the Needleman Wunsch alignment score between two strings. 
- * The method uses 32-bit integers to accumulate the running score for every cell in the matrix. - * Assuming the costs of substitutions can be arbitrary signed 8-bit integers, the method is expected to be used - * on strings not exceeding 2^24 length or 16.7 million characters. +/* AVX512 implementation of the string hashing algorithms for Skylake and newer CPUs. + * Includes extensions: F, CD, ER, PF, VL, DQ, BW. * - * Unlike the `_sz_edit_distance_skewed_diagonals_upto65k_avx512` method, this one uses signed integers to store - * the accumulated score. Moreover, it's primary bottleneck is the latency of gathering the substitution costs - * from the substitution matrix. If we use the diagonal order, we will be comparing a slice of the first string with - * a slice of the second. If we stick to the conventional horizontal order, we will be comparing one character against - * a slice, which is much easier to optimize. In that case we are sampling costs not from arbitrary parts of - * a 256 x 256 matrix, but from a single row! + * This is the "starting level" for the advanced algorithms using K-mask registers on x86. */ -SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { - - // If one of the strings is empty - the edit distance is equal to the length of the other one - if (longer_length == 0) return (sz_ssize_t)shorter_length * gap; - if (shorter_length == 0) return (sz_ssize_t)longer_length * gap; - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - sz_size_t const max_length = 256ull * 256ull * 256ull; - sz_size_t const n = longer_length + 1; - sz_assert(n < max_length && "The length must fit into 24-bit integer. Otherwise use serial variant."); - sz_unused(longer_length && max_length); - - sz_size_t buffer_length = sizeof(sz_i32_t) * n * 2; - sz_i32_t *distances = (sz_i32_t *)alloc->allocate(buffer_length, alloc->handle); - sz_i32_t *previous_distances = distances; - sz_i32_t *current_distances = previous_distances + n; - - // Intialize the first row of the Levenshtein matrix with `iota`. - for (sz_size_t idx_longer = 0; idx_longer != n; ++idx_longer) - previous_distances[idx_longer] = (sz_i32_t)idx_longer * gap; - - /// Contains up to 16 consecutive characters from the longer string. - sz_u512_vec_t longer_vec; - sz_u512_vec_t cost_deletion_vec, cost_substitution_vec, lookup_substitution_vec, current_vec; - sz_u512_vec_t row_first_subs_vec, row_second_subs_vec, row_third_subs_vec, row_fourth_subs_vec; - sz_u512_vec_t shuffled_first_subs_vec, shuffled_second_subs_vec, shuffled_third_subs_vec, shuffled_fourth_subs_vec; - - // Prepare constants and masks. 
- sz_u512_vec_t is_third_or_fourth_vec, is_second_or_fourth_vec, gap_vec; - { - char is_third_or_fourth_check, is_second_or_fourth_check; - *(sz_u8_t *)&is_third_or_fourth_check = 0x80, *(sz_u8_t *)&is_second_or_fourth_check = 0x40; - is_third_or_fourth_vec.zmm = _mm512_set1_epi8(is_third_or_fourth_check); - is_second_or_fourth_vec.zmm = _mm512_set1_epi8(is_second_or_fourth_check); - gap_vec.zmm = _mm512_set1_epi32(gap); - } - - sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter; - for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) { - sz_i32_t last_in_row = current_distances[0] = (sz_i32_t)(idx_shorter + 1) * gap; - - // Load one row of the substitution matrix into four ZMM registers. - sz_error_cost_t const *row_subs = subs + shorter_unsigned[idx_shorter] * 256u; - row_first_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 0); - row_second_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 1); - row_third_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 2); - row_fourth_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 3); - - // In the serial version we have one forward pass, that computes the deletion, - // insertion, and substitution costs at once. - // for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) { - // sz_ssize_t cost_deletion = previous_distances[idx_longer + 1] + gap; - // sz_ssize_t cost_insertion = current_distances[idx_longer] + gap; - // sz_ssize_t cost_substitution = previous_distances[idx_longer] + row_subs[longer_unsigned[idx_longer]]; - // current_distances[idx_longer + 1] = sz_min_of_three(cost_deletion, cost_insertion, cost_substitution); - // } - // - // Given the complexity of handling the data-dependency between consecutive insertion cost computations - // within a Levenshtein matrix, the simplest design would be to vectorize every kind of cost computation - // separately. - // 1. Compute substitution costs for up to 64 characters at once, upcasting from 8-bit integers to 32. - // 2. Compute the pairwise minimum with deletion costs. - // 3. Inclusive prefix minimum computation to combine with addition costs. - // Proceeding with substitutions: - for (sz_size_t idx_longer = 0; idx_longer < longer_length; idx_longer += 64) { - sz_size_t register_length = sz_min_of_two(longer_length - idx_longer, 64); - __mmask64 mask = _sz_u64_mask_until(register_length); - longer_vec.zmm = _mm512_maskz_loadu_epi8(mask, longer + idx_longer); - - // Blend the `row_(first|second|third|fourth)_subs_vec` into `current_vec`, picking the right source - // for every character in `longer_vec`. Before that, we need to permute the subsititution vectors. - // Only the bottom 6 bits of a byte are used in VPERB, so we don't even need to mask. - shuffled_first_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_first_subs_vec.zmm); - shuffled_second_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_second_subs_vec.zmm); - shuffled_third_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_third_subs_vec.zmm); - shuffled_fourth_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_fourth_subs_vec.zmm); - - // To blend we can invoke three `_mm512_cmplt_epu8_mask`, but we can also achieve the same using - // the AND logical operation, checking the top two bits of every byte. - // Continuing this thought, we can use the VPTESTMB instruction to output the mask after the AND. 
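/* A minimal scalar version of the row-wise recurrence spelled out in the commented reference
 * above, with 32-bit accumulators as the doc-comment assumes. Scores are maximized here, which
 * is the convention the vectorized loop's `max` operations follow; a distance-flavored variant
 * would minimize costs instead. A sketch of the dataflow, not the library's implementation. */
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>

static int32_t max3_i32(int32_t a, int32_t b, int32_t c) {
    int32_t m = a > b ? a : b;
    return m > c ? m : c;
}

// `subs` is a 256 x 256 matrix of signed substitution scores, `gap` a (typically negative) gap score.
static int32_t needleman_wunsch_score(uint8_t const *shorter, size_t shorter_length, //
                                      uint8_t const *longer, size_t longer_length,   //
                                      int8_t const *subs, int8_t gap) {
    size_t const columns = longer_length + 1;
    int32_t *previous = (int32_t *)malloc(columns * 2 * sizeof(int32_t));
    if (!previous) return INT32_MIN;
    int32_t *current = previous + columns;
    for (size_t j = 0; j != columns; ++j) previous[j] = (int32_t)j * gap;
    for (size_t i = 0; i != shorter_length; ++i) {
        current[0] = (int32_t)(i + 1) * gap;
        // Only one row of the substitution matrix is needed per iteration; it is sampled by the longer string's bytes.
        int8_t const *row_subs = subs + (size_t)shorter[i] * 256u;
        for (size_t j = 0; j != longer_length; ++j) {
            int32_t deletion = previous[j + 1] + gap;
            int32_t insertion = current[j] + gap;
            int32_t substitution = previous[j] + row_subs[longer[j]];
            current[j + 1] = max3_i32(deletion, insertion, substitution);
        }
        int32_t *temporary = previous; // Swap the two rows.
        previous = current, current = temporary;
    }
    int32_t result = previous[longer_length];
    free(previous < current ? previous : current); // The smaller pointer is the original allocation.
    return result;
}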
- __mmask64 is_third_or_fourth = _mm512_mask_test_epi8_mask(mask, longer_vec.zmm, is_third_or_fourth_vec.zmm); - __mmask64 is_second_or_fourth = - _mm512_mask_test_epi8_mask(mask, longer_vec.zmm, is_second_or_fourth_vec.zmm); - lookup_substitution_vec.zmm = _mm512_mask_blend_epi8( - is_third_or_fourth, - // Choose between the first and the second. - _mm512_mask_blend_epi8(is_second_or_fourth, shuffled_first_subs_vec.zmm, shuffled_second_subs_vec.zmm), - // Choose between the third and the fourth. - _mm512_mask_blend_epi8(is_second_or_fourth, shuffled_third_subs_vec.zmm, shuffled_fourth_subs_vec.zmm)); - - // First, sign-extend lower and upper 16 bytes to 16-bit integers. - __m512i current_0_31_vec = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(lookup_substitution_vec.zmm, 0)); - __m512i current_32_63_vec = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(lookup_substitution_vec.zmm, 1)); - - // Now extend those 16-bit integers to 32-bit. - // This isn't free, same as the subsequent store, so we only want to do that for the populated lanes. - // To minimize the number of loads and stores, we can combine our substitution costs with the previous - // distances, containing the deletion costs. - { - cost_substitution_vec.zmm = _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer); - cost_substitution_vec.zmm = _mm512_add_epi32( - cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_0_31_vec, 0))); - cost_deletion_vec.zmm = _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer); - cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm); - current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm); - - // Inclusive prefix minimum computation to combine with insertion costs. - // Simply disabling this operation results in 5x performance improvement, meaning - // that this operation is responsible for 80% of the total runtime. - // for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) { - // current_distances[idx_longer + 1] = - // sz_max_of_two(current_distances[idx_longer] + gap, current_distances[idx_longer + 1]); - // } - // - // To perform the same operation in vectorized form, we need to perform a tree-like reduction, - // that will involve multiple steps. It's quite expensive and should be first tested in the - // "experimental" section. - // - // Another approach might be loop unrolling: - // current_vec.i32s[0] = last_in_row = sz_i32_max_of_two(current_vec.i32s[0], last_in_row + gap); - // current_vec.i32s[1] = last_in_row = sz_i32_max_of_two(current_vec.i32s[1], last_in_row + gap); - // current_vec.i32s[2] = last_in_row = sz_i32_max_of_two(current_vec.i32s[2], last_in_row + gap); - // ... yet this approach is also quite expensive. - for (int i = 0; i != 16; ++i) - current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap); - _mm512_mask_storeu_epi32(current_distances + idx_longer + 1, (__mmask16)mask, current_vec.zmm); - } - - // Export the values from 16 to 31. 
- if (register_length > 16) { - mask = _kshiftri_mask64(mask, 16); - cost_substitution_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 16); - cost_substitution_vec.zmm = _mm512_add_epi32( - cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_0_31_vec, 1))); - cost_deletion_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 16); - cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm); - current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm); - - // Aggregate running insertion costs within the register. - for (int i = 0; i != 16; ++i) - current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap); - _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 16, (__mmask16)mask, current_vec.zmm); - } - - // Export the values from 32 to 47. - if (register_length > 32) { - mask = _kshiftri_mask64(mask, 16); - cost_substitution_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 32); - cost_substitution_vec.zmm = _mm512_add_epi32( - cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_32_63_vec, 0))); - cost_deletion_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 32); - cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm); - current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm); - - // Aggregate running insertion costs within the register. - for (int i = 0; i != 16; ++i) - current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap); - _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 32, (__mmask16)mask, current_vec.zmm); - } - - // Export the values from 32 to 47. - if (register_length > 48) { - mask = _kshiftri_mask64(mask, 16); - cost_substitution_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 48); - cost_substitution_vec.zmm = _mm512_add_epi32( - cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_32_63_vec, 1))); - cost_deletion_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 48); - cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm); - current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm); - - // Aggregate running insertion costs within the register. - for (int i = 0; i != 16; ++i) - current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap); - _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 48, (__mmask16)mask, current_vec.zmm); - } - } - - // Swap previous_distances and current_distances pointers - sz_pointer_swap((void **)&previous_distances, (void **)¤t_distances); - } - - // Cache scalar before `free` call. 
- sz_ssize_t result = previous_distances[longer_length]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -SZ_INTERNAL sz_ssize_t sz_alignment_score_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { - - if (sz_max_of_two(shorter_length, longer_length) < (256ull * 256ull * 256ull)) - return _sz_alignment_score_wagner_fisher_upto17m_avx512(shorter, shorter_length, longer, longer_length, subs, - gap, alloc); - else - return sz_alignment_score_serial(shorter, shorter_length, longer, longer_length, subs, gap, alloc); -} - -enum sz_encoding_t { - sz_encoding_unknown_k = 0, - sz_encoding_ascii_k = 1, - sz_encoding_utf8_k = 2, - sz_encoding_utf16_k = 3, - sz_encoding_utf32_k = 4, - sz_jwt_k, - sz_base64_k, - // Low priority encodings: - sz_encoding_utf8bom_k = 5, - sz_encoding_utf16le_k = 6, - sz_encoding_utf16be_k = 7, - sz_encoding_utf32le_k = 8, - sz_encoding_utf32be_k = 9, -}; - -// Character Set Detection is one of the most commonly performed operations in data processing with -// [Chardet](https://github.com/chardet/chardet), [Charset Normalizer](https://github.com/jawah/charset_normalizer), -// [cChardet](https://github.com/PyYoshi/cChardet) being the most commonly used options in the Python ecosystem. -// All of them are notoriously slow. -// -// Moreover, as of October 2024, UTF-8 is the dominant character encoding on the web, used by 98.4% of websites. -// Other have minimal usage, according to [W3Techs](https://w3techs.com/technologies/overview/character_encoding): -// - ISO-8859-1: 1.2% -// - Windows-1252: 0.3% -// - Windows-1251: 0.2% -// - EUC-JP: 0.1% -// - Shift JIS: 0.1% -// - EUC-KR: 0.1% -// - GB2312: 0.1% -// - Windows-1250: 0.1% -// Within programming language implementations and database management systems, 16-bit and 32-bit fixed-width encodings -// are also very popular and we need a way to efficienly differentiate between the most common UTF flavors, ASCII, and -// the rest. -// -// One good solution is the [simdutf](https://github.com/simdutf/simdutf) library, but it depends on the C++ runtime -// and focuses more on incremental validation & transcoding, rather than detection. -// -// So we need a very fast and efficient way of determining -SZ_PUBLIC sz_bool_t sz_detect_encoding(sz_cptr_t text, sz_size_t length) { - // https://github.com/simdutf/simdutf/blob/master/src/icelake/icelake_utf8_validation.inl.cpp - // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_from_utf8.inl.cpp#L81 - // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L661 - // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L788 - - // We can implement this operation simpler & differently, assuming most of the time continuous chunks of memory - // have identical encoding. With Russian and many European languages, we generally deal with 2-byte codepoints - // with occasional 1-byte punctuation marks. In the case of Chinese, Japanese, and Korean, we deal with 3-byte - // codepoints. In the case of emojis, we deal with 4-byte codepoints. - // We can also use the idea, that misaligned reads are quite cheap on modern CPUs. 
- int can_be_ascii = 1, can_be_utf8 = 1, can_be_utf16 = 1, can_be_utf32 = 1; - sz_unused(can_be_ascii + can_be_utf8 + can_be_utf16 + can_be_utf32); - sz_unused(text && length); - return sz_false_k; -} +#pragma region Skylake Implementation +#if SZ_USE_SKYLAKE +#pragma GCC push_options +#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") +#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) #pragma clang attribute pop #pragma GCC pop_options -#endif +#endif // SZ_USE_SKYLAKE +#pragma endregion // Skylake Implementation -#pragma endregion - -/* @brief Implementation of the string search algorithms using the Arm NEON instruction set, available on 64-bit - * Arm processors. Implements: {substring search, character search, character set search} x {forward, reverse}. +/* AVX512 implementation of the string search algorithms for Ice Lake and newer CPUs. + * Includes extensions: + * - 2017 Skylake: F, CD, ER, PF, VL, DQ, BW, + * - 2018 CannonLake: IFMA, VBMI, + * - 2019 Ice Lake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES. */ -#pragma region ARM NEON - -#if SZ_USE_ARM_NEON +#pragma region Ice Lake Implementation +#if SZ_USE_ICE #pragma GCC push_options -#pragma GCC target("arch=armv8.2-a+simd") -#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) - -/** - * @brief Helper structure to simplify work with 64-bit words. - */ -typedef union sz_u128_vec_t { - uint8x16_t u8x16; - uint16x8_t u16x8; - uint32x4_t u32x4; - uint64x2_t u64x2; - sz_u64_t u64s[2]; - sz_u32_t u32s[4]; - sz_u16_t u16s[8]; - sz_u8_t u8s[16]; -} sz_u128_vec_t; - -SZ_INTERNAL sz_u64_t _sz_vreinterpretq_u8_u4(uint8x16_t vec) { - // Use `vshrn` to produce a bitmask, similar to `movemask` in SSE. - // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon - return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(vec), 4)), 0) & 0x8888888888888888ull; -} +#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2") +#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ + apply_to = function) -SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: - //! https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations - return sz_order_serial(a, a_length, b, b_length); -} +SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) { + // The naive implementation of this function is very simple. + // It assumes the CPU is great at handling unaligned "loads". + // + // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, + // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. + // With two strings, we may consider the overall workload huge, if each exceeds 1 MB in length. 
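+    //
+    // For reference, a naive scalar sketch of the same byte-wise sum (illustrative only,
+    // not part of the original implementation) would be:
+    //
+    //      sz_u64_t sum = 0;
+    //      for (sz_size_t i = 0; i != length; ++i) sum += (sz_u8_t)text[i];
+    //      return sum;
+    //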
+    int const is_huge = length >= 1ull * 1024ull * 1024ull;
+    sz_u512_vec_t text_vec, sums_vec;
 
-SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length) {
-    sz_u128_vec_t a_vec, b_vec;
-    for (; length >= 16; a += 16, b += 16, length -= 16) {
-        a_vec.u8x16 = vld1q_u8((sz_u8_t const *)a);
-        b_vec.u8x16 = vld1q_u8((sz_u8_t const *)b);
-        uint8x16_t cmp = vceqq_u8(a_vec.u8x16, b_vec.u8x16);
-        if (vminvq_u8(cmp) != 255) { return sz_false_k; } // Check if all bytes match
+    // When the buffer is small, there isn't much to innovate.
+    if (length <= 16) {
+        __mmask16 mask = _sz_u16_mask_until(length);
+        text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text);
+        sums_vec.xmms[0] = _mm_sad_epu8(text_vec.xmms[0], _mm_setzero_si128());
+        sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_vec.xmms[0]);
+        sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_vec.xmms[0], 1);
+        return low + high;
     }
-
-    // Handle remaining bytes
-    if (length) return sz_equal_serial(a, b, length);
-    return sz_true_k;
-}
-
-SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) {
-    uint64x2_t sum_vec = vdupq_n_u64(0);
-
-    // Process 16 bytes (128 bits) at a time
-    for (; length >= 16; text += 16, length -= 16) {
-        uint8x16_t vec = vld1q_u8((sz_u8_t const *)text);       // Load 16 bytes
-        uint16x8_t pairwise_sum1 = vpaddlq_u8(vec);             // Pairwise add lower and upper 8 bits
-        uint32x4_t pairwise_sum2 = vpaddlq_u16(pairwise_sum1);  // Pairwise add 16-bit results
-        uint64x2_t pairwise_sum3 = vpaddlq_u32(pairwise_sum2);  // Pairwise add 32-bit results
-        sum_vec = vaddq_u64(sum_vec, pairwise_sum3);            // Accumulate the sum
+    else if (length <= 32) {
+        __mmask32 mask = _sz_u32_mask_until(length);
+        text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text);
+        sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256());
+        // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first.
+        __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]);
+        __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1);
+        __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm);
+        sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm);
+        sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1);
+        return low + high;
     }
-
-    // Final reduction of `sum_vec` to a single scalar
-    sz_u64_t sum = vgetq_lane_u64(sum_vec, 0) + vgetq_lane_u64(sum_vec, 1);
-    if (length) sum += sz_checksum_serial(text, length);
-    return sum;
-}
-
-SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
-    // In most cases the `source` and the `target` are not aligned, but we should
-    // at least make sure that writes don't touch many cache lines.
-    // NEON has an instruction to load and write 64 bytes at once.
+    else if (length <= 64) {
+        __mmask64 mask = _sz_u64_mask_until(length);
+        text_vec.zmm = _mm512_maskz_loadu_epi8(mask, text);
+        sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
+        return _mm512_reduce_add_epi64(sums_vec.zmm);
+    }
+    else if (!is_huge) {
+        sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less.
+        sz_size_t tail_length = (sz_size_t)(text + length) % 64;    // 63 or less.
+        sz_size_t body_length = length - head_length - tail_length; // Multiple of 64.
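+        // To make the split concrete (an illustrative example, not from the original source):
+        // for `text` at an address with `(sz_size_t)text % 64 == 13` and `length == 200`, we get
+        // `head_length == 51`, `tail_length == (13 + 200) % 64 == 21`, and `body_length == 128`.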
+        __mmask64 head_mask = _sz_u64_mask_until(head_length);
+        __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
+        text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
+        sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
+        for (text += head_length; body_length >= 64; text += 64, body_length -= 64) {
+            text_vec.zmm = _mm512_load_si512((__m512i const *)text);
+            sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
+        }
+        text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text);
+        sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
+        return _mm512_reduce_add_epi64(sums_vec.zmm);
+    }
+    // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use.
     //
-    // sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less.
-    // sz_size_t tail_length = (sz_size_t)(target + length) % 64;    // 63 or less.
-    // for (; head_length; target += 1, source += 1, head_length -= 1) *target = *source;
-    // length -= head_length;
-    // for (; length >= 64; target += 64, source += 64, length -= 64)
-    //     vst4q_u8((sz_u8_t *)target, vld1q_u8_x4((sz_u8_t const *)source));
-    // for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = *source;
+    // 1. Moving in both directions to maximize the throughput, when fetching from multiple
+    //    memory pages. Also helps with cache set-associativity issues, as we won't always
+    //    be hitting the same cache sets.
+    // 2. Using non-temporal loads to avoid polluting the cache.
+    // 3. Prefetching the next cache line, to avoid stalling the CPU. This is generally useless
+    //    for predictable access patterns, where the hardware prefetcher already does the job.
     //
-    // Sadly, those instructions end up being 20% slower than the code processing 16 bytes at a time:
-    for (; length >= 16; target += 16, source += 16, length -= 16)
-        vst1q_u8((sz_u8_t *)target, vld1q_u8((sz_u8_t const *)source));
-    if (length) sz_copy_serial(target, source, length);
-}
-
-SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
-    // When moving small buffers, using a small buffer on stack as a temporary storage is faster.
-
-    if (target < source || target >= source + length) {
-        // Non-overlapping, proceed forward
-        sz_copy_neon(target, source, length);
-    }
+    // Bidirectional traversal generally adds about 10% to such algorithms.
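+    //
+    // A rough scalar sketch of the bidirectional walk below (illustrative only;
+    // `consume_64_bytes` is a made-up placeholder for one 64-byte accumulation step):
+    //
+    //      while (body_length >= 128) {
+    //          consume_64_bytes(text);                    // forward cursor
+    //          consume_64_bytes(text + body_length - 64); // backward cursor
+    //          text += 64, body_length -= 128;
+    //      }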
    else {
-        // Overlapping, proceed backward
-        target += length;
-        source += length;
+        sz_u512_vec_t text_reversed_vec, sums_reversed_vec;
+        sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64;
+        sz_size_t tail_length = (sz_size_t)(text + length) % 64;
+        sz_size_t body_length = length - head_length - tail_length;
+        __mmask64 head_mask = _sz_u64_mask_until(head_length);
+        __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
 
-        sz_u128_vec_t src_vec;
-        while (length >= 16) {
-            target -= 16, source -= 16, length -= 16;
-            src_vec.u8x16 = vld1q_u8((sz_u8_t const *)source);
-            vst1q_u8((sz_u8_t *)target, src_vec.u8x16);
-        }
-        while (length) {
-            target -= 1, source -= 1, length -= 1;
-            *target = *source;
-        }
-    }
-}
+        text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
+        sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
+        text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length);
+        sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512());
 
-SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value) {
-    uint8x16_t fill_vec = vdupq_n_u8(value); // Broadcast the value across the register
+        // Now in the main loop, we can use non-temporal loads,
+        // performing the operation in both directions.
+        for (text += head_length; body_length >= 128; text += 64, body_length -= 128) {
+            text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
+            sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
+            text_reversed_vec.zmm = _mm512_stream_load_si512((__m512i *)(text + body_length - 64));
+            sums_reversed_vec.zmm =
+                _mm512_add_epi64(sums_reversed_vec.zmm, _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()));
+        }
+        if (body_length >= 64) {
+            text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
+            sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
+        }
 
-    while (length >= 16) {
-        vst1q_u8((sz_u8_t *)target, fill_vec);
-        target += 16;
-        length -= 16;
+        return _mm512_reduce_add_epi64(_mm512_add_epi64(sums_vec.zmm, sums_reversed_vec.zmm));
     }
-
-    // Handle remaining bytes
-    if (length) sz_fill_serial(target, length, value);
 }
 
-SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) {
+SZ_PUBLIC void sz_hashes_ice(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, //
+                             sz_hash_callback_t callback, void *callback_handle) {
 
-    // If the input is tiny (especially smaller than the look-up table itself), we may end up paying
-    // more for organizing the SIMD registers and changing the CPU state, than for the actual computation.
-    if (length <= 128) {
-        sz_look_up_transform_serial(source, length, lut, target);
+    if (length < window_length || !window_length) return;
+    if (length < 4 * window_length) {
+        sz_hashes_serial(start, length, window_length, step, callback, callback_handle);
         return;
     }
 
-    sz_size_t head_length = (16 - ((sz_size_t)target % 16)) % 16; // 15 or less.
-    sz_size_t tail_length = (sz_size_t)(target + length) % 16;    // 15 or less.
-
-    // We need to pull the lookup table into 16x NEON registers. We have a total of 32 such registers.
-    // According to the Neoverse V2 manual, the 4-table lookup has a latency of 6 cycles, and 4x throughput.
- uint8x16x4_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec; - lut_0_to_63_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 0)); - lut_64_to_127_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 64)); - lut_128_to_191_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 128)); - lut_192_to_255_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 192)); - - sz_u128_vec_t source_vec; - // If the top bit is set in each word of `source_vec`, than we use `lookup_128_to_191_vec` or - // `lookup_192_to_255_vec`. If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`. - sz_u128_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec; - sz_u128_vec_t blended_0_to_255_vec; - - // Process the head with serial code - for (; head_length; target += 1, source += 1, head_length -= 1) *target = lut[*(sz_u8_t const *)source]; + // Using AVX2, we can perform 4 long integer multiplications and additions within one register. + // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel. + sz_size_t const max_hashes = length - window_length + 1; + sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads. + sz_u8_t const *text_first = (sz_u8_t const *)start; + sz_u8_t const *text_second = text_first + min_hashes_per_thread; + sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2; + sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3; + sz_u8_t const *text_end = text_first + length; - // Table lookups on Arm are much simpler to use than on x86, as we can use the `vqtbl4q_u8` instruction - // to perform a 4-table lookup in a single instruction. The XORs are used to adjust the lookup position - // within each 64-byte range of the table. - // Details on the 4-table lookup: https://lemire.me/blog/2019/07/23/arbitrary-byte-to-byte-maps-using-arm-neon/ - length -= head_length; - length -= tail_length; - for (; length >= 16; source += 16, target += 16, length -= 16) { - source_vec.u8x16 = vld1q_u8((sz_u8_t const *)source); - lookup_0_to_63_vec.u8x16 = vqtbl4q_u8(lut_0_to_63_vec, source_vec.u8x16); - lookup_64_to_127_vec.u8x16 = vqtbl4q_u8(lut_64_to_127_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x40))); - lookup_128_to_191_vec.u8x16 = vqtbl4q_u8(lut_128_to_191_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x80))); - lookup_192_to_255_vec.u8x16 = vqtbl4q_u8(lut_192_to_255_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0xc0))); - blended_0_to_255_vec.u8x16 = vorrq_u8(vorrq_u8(lookup_0_to_63_vec.u8x16, lookup_64_to_127_vec.u8x16), - vorrq_u8(lookup_128_to_191_vec.u8x16, lookup_192_to_255_vec.u8x16)); - vst1q_u8((sz_u8_t *)target, blended_0_to_255_vec.u8x16); - } + // Broadcast the global constants into the registers. + // Both high and low hashes will work with the same prime and golden ratio. + sz_u512_vec_t prime_vec, golden_ratio_vec; + prime_vec.zmm = _mm512_set1_epi64(SZ_U64_MAX_PRIME); + golden_ratio_vec.zmm = _mm512_set1_epi64(11400714819323198485ull); - // Process the tail with serial code - for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = lut[*(sz_u8_t const *)source]; -} + // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. 
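+    // More precisely, after the loop below `prime_power_low` holds `31 ^ (window_length - 1) mod SZ_U64_MAX_PRIME`
+    // (and `prime_power_high` the same with base 257) - the weight of the character leaving the
+    // window in the usual rolling-hash update, roughly:
+    //
+    //      new_hash = (old_hash - outgoing * base^(window_length - 1)) * base + incoming    (mod prime)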
+ sz_u64_t prime_power_low = 1, prime_power_high = 1; + for (sz_size_t i = 0; i + 1 < window_length; ++i) + prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, + prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - sz_u64_t matches; - sz_u128_vec_t h_vec, n_vec, matches_vec; - n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n); + // We will be evaluating 4 offsets at a time with 2 different hash functions. + // We can fit all those 8 state variables in each of the following ZMM registers. + sz_u512_vec_t base_vec, prime_power_vec, shift_vec; + base_vec.zmm = _mm512_set_epi64(31ull, 31ull, 31ull, 31ull, 257ull, 257ull, 257ull, 257ull); + shift_vec.zmm = _mm512_set_epi64(0ull, 0ull, 0ull, 0ull, 77ull, 77ull, 77ull, 77ull); + prime_power_vec.zmm = _mm512_set_epi64(prime_power_low, prime_power_low, prime_power_low, prime_power_low, + prime_power_high, prime_power_high, prime_power_high, prime_power_high); - while (h_length >= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h); - matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16); - // In Arm NEON we don't have a `movemask` to combine it with `ctz` and get the offset of the match. - // But assuming the `vmaxvq` is cheap, we can use it to find the first match, by blending (bitwise selecting) - // the vector with a relative offsets array. - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; + // Compute the initial hash values for every one of the four windows. + sz_u512_vec_t hash_vec, chars_vec; + hash_vec.zmm = _mm512_setzero_si512(); + for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; + ++text_first, ++text_second, ++text_third, ++text_fourth) { - h += 16, h_length -= 16; - } + // 1. Multiply the hashes by the base. + hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - return sz_find_byte_serial(h, h_length, n); -} + // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, + // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`... + chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // + text_fourth[0], text_third[0], text_second[0], text_first[0]); + chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - sz_u64_t matches; - sz_u128_vec_t h_vec, n_vec, matches_vec; - n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n); + // 3. Add the incoming characters. + hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - while (h_length >= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h + h_length - 16); - matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; - h_length -= 16; + // 4. Compute the modulo. Assuming there are only 59 values between our prime + // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. 
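+        //    In scalar form, that conditional subtraction is roughly:
+        //
+        //        if (hash > SZ_U64_MAX_PRIME) hash -= SZ_U64_MAX_PRIME;
+        //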
+ hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, + _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); } - return sz_rfind_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_u64_t _sz_find_charset_neon_register(sz_u128_vec_t h_vec, uint8x16_t set_top_vec_u8x16, - uint8x16_t set_bottom_vec_u8x16) { - - // Once we've read the characters in the haystack, we want to - // compare them against our bitset. The serial version of that code - // would look like: `(set_->_u8s[c >> 3] & (1u << (c & 7u))) != 0`. - uint8x16_t byte_index_vec = vshrq_n_u8(h_vec.u8x16, 3); - uint8x16_t byte_mask_vec = vshlq_u8(vdupq_n_u8(1), vreinterpretq_s8_u8(vandq_u8(h_vec.u8x16, vdupq_n_u8(7)))); - uint8x16_t matches_top_vec = vqtbl1q_u8(set_top_vec_u8x16, byte_index_vec); - // The table lookup instruction in NEON replies to out-of-bound requests with zeros. - // The values in `byte_index_vec` all fall in [0; 32). So for values under 16, substracting 16 will underflow - // and map into interval [240, 256). Meaning that those will be populated with zeros and we can safely - // merge `matches_top_vec` and `matches_bottom_vec` with a bitwise OR. - uint8x16_t matches_bottom_vec = vqtbl1q_u8(set_bottom_vec_u8x16, vsubq_u8(byte_index_vec, vdupq_n_u8(16))); - uint8x16_t matches_vec = vorrq_u8(matches_top_vec, matches_bottom_vec); - // Istead of pure `vandq_u8`, we can immediately broadcast a match presence across each 8-bit word. - matches_vec = vtstq_u8(matches_vec, byte_mask_vec); - return _sz_vreinterpretq_u8_u4(matches_vec); -} + // 5. Compute the hash mix, that will be used to index into the fingerprint. + // This includes a serial step at the end. + sz_u512_vec_t hash_mix_vec; + hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); + hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // + _mm512_extracti64x4_epi64(hash_mix_vec.zmm, 0)); -SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { + callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); + callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); + callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); + callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_neon(h, h_length, n); + // Now repeat that operation for the remaining characters, discarding older characters. + sz_size_t cycle = 1; + sz_size_t step_mask = step - 1; + for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { + // 0. Load again the four characters we are dropping, shift them, and subtract. + chars_vec.zmm = _mm512_set_epi64(text_fourth[-window_length], text_third[-window_length], + text_second[-window_length], text_first[-window_length], // + text_fourth[-window_length], text_third[-window_length], + text_second[-window_length], text_first[-window_length]); + chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); + hash_vec.zmm = _mm512_sub_epi64(hash_vec.zmm, _mm512_mullo_epi64(chars_vec.zmm, prime_power_vec.zmm)); - // Scan through the string. - // Assuming how tiny the Arm NEON registers are, we should avoid internal branches at all costs. 
- // That's why, for smaller needles, we use different loops. - if (n_length == 2) { - // Broadcast needle characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_last_vec, n_first_vec, n_last_vec, matches_vec; - // Dealing with 16-bit values, we can load 2 registers at a time and compare 31 possible offsets - // in a single loop iteration. - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]); - for (; h_length >= 17; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1)); - matches_vec.u8x16 = - vandq_u8(vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - } - else if (n_length == 3) { - // Broadcast needle characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - // Comparing 24-bit values is a bumer. Being lazy, I went with the same approach - // as when searching for string over 4 characters long. I only avoid the last comparison. - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[2]); - for (; h_length >= 18; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 2)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - } - else { - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - // Broadcast those characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]); - // Walk through the string. - for (; h_length >= n_length + 16; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_first)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_mid)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_last)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - while (matches) { - int potential_offset = sz_u64_ctz(matches) / 4; - if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - } + // 1. Multiply the hashes by the base. + hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - return sz_find_serial(h, h_length, n, n_length); -} + // 2. 
Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, + // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. + chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // + text_fourth[0], text_third[0], text_second[0], text_first[0]); + chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { + // ... and prefetch the next four characters into Level 2 or higher. + _mm_prefetch((sz_cptr_t)text_fourth + 1, _MM_HINT_T1); + _mm_prefetch((sz_cptr_t)text_third + 1, _MM_HINT_T1); + _mm_prefetch((sz_cptr_t)text_second + 1, _MM_HINT_T1); + _mm_prefetch((sz_cptr_t)text_first + 1, _MM_HINT_T1); - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_neon(h, h_length, n); + // 3. Add the incoming characters. + hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); + // 4. Compute the modulo. Assuming there are only 59 values between our prime + // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. + hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, + _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); - // Will contain 4 bits per character. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]); + // 5. Compute the hash mix, that will be used to index into the fingerprint. + // This includes a serial step at the end. 
+ hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); + hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // + _mm512_castsi512_si256(hash_mix_vec.zmm)); - sz_cptr_t h_reversed; - for (; h_length >= n_length + 16; h_length -= 16) { - h_reversed = h + h_length - n_length - 16 + 1; - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_first)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_mid)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_last)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - while (matches) { - int potential_offset = sz_u64_clz(matches) / 4; - if (sz_equal(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & (1ull << (63 - potential_offset * 4))) != 0 && - "The bit must be set before we squash it"); - matches &= ~(1ull << (63 - potential_offset * 4)); + if ((cycle & step_mask) == 0) { + callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); + callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); + callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); + callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); } } - - return sz_rfind_serial(h, h_length, n, n_length); } -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { - sz_u64_t matches; - sz_u128_vec_t h_vec; - uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); - uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); - - for (; h_length >= 16; h += 16, h_length -= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h)); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } +#pragma clang attribute pop +#pragma GCC pop_options +#endif // SZ_USE_ICE +#pragma endregion // Ice Lake Implementation - return sz_find_charset_serial(h, h_length, set); -} +/* Implementation of the string hashing algorithms using the Arm NEON instruction set, available on 64-bit + * Arm processors. Covers billions of mobile CPUs worldwide, including Apple's A-series, and Qualcomm's Snapdragon. + */ +#pragma region NEON Implementation +#if SZ_USE_NEON +#pragma GCC push_options +#pragma GCC target("arch=armv8.2-a+simd") +#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { - sz_u64_t matches; - sz_u128_vec_t h_vec; - uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); - uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); +SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) { + uint64x2_t sum_vec = vdupq_n_u64(0); - // Check `sz_find_charset_neon` for explanations. 
- for (; h_length >= 16; h_length -= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h) + h_length - 16); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); - if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; + // Process 16 bytes (128 bits) at a time + for (; length >= 16; text += 16, length -= 16) { + uint8x16_t vec = vld1q_u8((sz_u8_t const *)text); // Load 16 bytes + uint16x8_t pairwise_sum1 = vpaddlq_u8(vec); // Pairwise add lower and upper 8 bits + uint32x4_t pairwise_sum2 = vpaddlq_u16(pairwise_sum1); // Pairwise add 16-bit results + uint64x2_t pairwise_sum3 = vpaddlq_u32(pairwise_sum2); // Pairwise add 32-bit results + sum_vec = vaddq_u64(sum_vec, pairwise_sum3); // Accumulate the sum } - return sz_rfind_charset_serial(h, h_length, set); + // Final reduction of `sum_vec` to a single scalar + sz_u64_t sum = vgetq_lane_u64(sum_vec, 0) + vgetq_lane_u64(sum_vec, 1); + if (length) sum += sz_checksum_serial(text, length); + return sum; } #pragma clang attribute pop #pragma GCC pop_options -#endif // Arm Neon - -#pragma endregion +#endif // SZ_USE_NEON +#pragma endregion // NEON Implementation -/* @brief Implementation of the string search algorithms using the Arm SVE variable-length registers, available - * in Arm v9 processors. - * - * Implements: - * - memory: {copy, move, fill} - * - comparisons: {equal, order} - * - search: {substring, character, character set} x {forward, reverse}. +/* Implementation of the string search algorithms using the Arm SVE variable-length registers, + * available in Arm v9 processors, like in Apple M4+ and Graviton 3+ CPUs. */ -#pragma region ARM SVE - -#if SZ_USE_ARM_SVE +#pragma region SVE Implementation +#if SZ_USE_SVE #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+sve") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) -SZ_PUBLIC void sz_fill_sve(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - svuint8_t value_vec = svdup_u8(value); - sz_size_t vec_len = svcntb(); // Vector length in bytes (scalable) - - if (length <= vec_len) { - // Small buffer case: use mask to handle small writes - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)length); - svst1_u8(mask, (unsigned char *)target, value_vec); - } - else { - // Calculate head, body, and tail sizes - sz_size_t head_length = vec_len - ((sz_size_t)target % vec_len); - sz_size_t tail_length = (sz_size_t)(target + length) % vec_len; - sz_size_t body_length = length - head_length - tail_length; - - // Handle unaligned head - svbool_t head_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)head_length); - svst1_u8(head_mask, (unsigned char *)target, value_vec); - target += head_length; - - // Aligned body loop - for (; body_length >= vec_len; target += vec_len, body_length -= vec_len) { - svst1_u8(svptrue_b8(), (unsigned char *)target, value_vec); - } - - // Handle unaligned tail - svbool_t tail_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)tail_length); - svst1_u8(tail_mask, (unsigned char *)target, value_vec); - } -} - -SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - sz_size_t vec_len = svcntb(); // Vector length in bytes - - // Arm Neoverse V2 cores in Graviton 4, for example, come with 256 KB of L1 data cache per core, - // and 8 MB of L2 cache per core. Moreover, the L1 cache is fully associative. - // With two strings, we may consider the overal workload huge, if each exceeds 1 MB in length. 
- // - // int is_huge = length >= 4ull * 1024ull * 1024ull; - // - // When the buffer is small, there isn't much to innovate. - if (length <= vec_len) { - // Small buffer case: use mask to handle small writes - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - } - // When dealing with larger buffers, similar to AVX-512, we want minimize unaligned operations - // and handle the head, body, and tail separately. We can also traverse the buffer in both directions - // as Arm generally supports more simultaneous stores than x86 CPUs. - // - // For gigantic datasets, similar to AVX-512, non-temporal "loads" and "stores" can be used. - // Sadly, if the register size (16 byte or larger) is smaller than a cache-line (64 bytes) - // we will pay a huge penalty on loads, fetching the same content many times. - // It may be better to allow caching (and subsequent eviction), in favor of using four-element - // tuples, wich will be guaranteed to be a multiple of a cache line. - // - // Another approach is to use the `LD4B` instructions, which will populate four registers at once. - // This however, further decreases the performance from LibC-like 29 GB/s to 20 GB/s. - else { - // Calculating head, body, and tail sizes depends on the `vec_len`, - // but it's runtime constant, and the modulo operation is expensive! - // Instead we use the fact, that it's always a multiple of 128 bits or 16 bytes. - sz_size_t head_length = 16 - ((sz_size_t)target % 16); - sz_size_t tail_length = (sz_size_t)(target + length) % 16; - sz_size_t body_length = length - head_length - tail_length; - - // Handle unaligned parts - svbool_t head_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)head_length); - svuint8_t head_data = svld1_u8(head_mask, (unsigned char *)source); - svst1_u8(head_mask, (unsigned char *)target, head_data); - svbool_t tail_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)tail_length); - svuint8_t tail_data = svld1_u8(tail_mask, (unsigned char *)source + head_length + body_length); - svst1_u8(tail_mask, (unsigned char *)target + head_length + body_length, tail_data); - target += head_length; - source += head_length; - - // Aligned body loop, walking in two directions - for (; body_length >= vec_len * 2; target += vec_len, source += vec_len, body_length -= vec_len * 2) { - svuint8_t forward_data = svld1_u8(svptrue_b8(), (unsigned char *)source); - svuint8_t backward_data = svld1_u8(svptrue_b8(), (unsigned char *)source + body_length - vec_len); - svst1_u8(svptrue_b8(), (unsigned char *)target, forward_data); - svst1_u8(svptrue_b8(), (unsigned char *)target + body_length - vec_len, backward_data); - } - // Up to (vec_len * 2 - 1) bytes of data may be left in the body, - // so we can unroll the last two optional loop iterations. 
- if (body_length > vec_len) { - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)body_length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - body_length -= vec_len; - source += body_length; - target += body_length; - } - if (body_length) { - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)body_length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - } - } -} - #pragma clang attribute pop #pragma GCC pop_options -#endif // Arm SVE +#endif // SZ_USE_SVE +#pragma endregion // SVE Implementation -#pragma endregion - -/* - * @brief Pick the right implementation for the string search algorithms. +/* Pick the right implementation for the string search algorithms. + * To override this behavior and precompile all backends - set `SZ_DYNAMIC_DISPATCH` to 1. */ #pragma region Compile Time Dispatching - -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t ins, sz_size_t length) { return sz_hash_serial(ins, length); } -SZ_PUBLIC void sz_tolower(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_tolower_serial(ins, length, outs); } -SZ_PUBLIC void sz_toupper(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toupper_serial(ins, length, outs); } -SZ_PUBLIC void sz_toascii(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toascii_serial(ins, length, outs); } -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t ins, sz_size_t length) { return sz_isascii_serial(ins, length); } - -SZ_PUBLIC void sz_hashes_fingerprint(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_ptr_t fingerprint, - sz_size_t fingerprint_bytes) { - - sz_bool_t fingerprint_length_is_power_of_two = (sz_bool_t)((fingerprint_bytes & (fingerprint_bytes - 1)) == 0); - sz_string_view_t fingerprint_buffer = {fingerprint, fingerprint_bytes}; - - // There are several issues related to the fingerprinting algorithm. - // First, the memory traversal order is important. - // https://blog.stuffedcow.net/2015/08/pagewalk-coherence/ - - // In most cases the fingerprint length will be a power of two. 
- if (fingerprint_length_is_power_of_two == sz_false_k) - sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_non_pow2_callback, &fingerprint_buffer); - else - sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_pow2_callback, &fingerprint_buffer); -} - #if !SZ_DYNAMIC_DISPATCH SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { -#if SZ_USE_X86_AVX512 - return sz_checksum_avx512(text, length); -#elif SZ_USE_X86_AVX2 +#if SZ_USE_ICE + return sz_checksum_ice(text, length); +#elif SZ_USE_HASWELL return sz_checksum_avx2(text, length); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_NEON return sz_checksum_neon(text, length); #else return sz_checksum_serial(text, length); #endif } -SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { -#if SZ_USE_X86_AVX512 - return sz_equal_avx512(a, b, length); -#elif SZ_USE_X86_AVX2 - return sz_equal_avx2(a, b, length); -#elif SZ_USE_ARM_NEON - return sz_equal_neon(a, b, length); -#else - return sz_equal_serial(a, b, length); -#endif -} - -SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { -#if SZ_USE_X86_AVX512 - return sz_order_avx512(a, a_length, b, b_length); -#elif SZ_USE_X86_AVX2 - return sz_order_avx2(a, a_length, b, b_length); -#elif SZ_USE_ARM_NEON - return sz_order_neon(a, a_length, b, b_length); -#else - return sz_order_serial(a, a_length, b, b_length); -#endif -} - -SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_X86_AVX512 - sz_copy_avx512(target, source, length); -#elif SZ_USE_X86_AVX2 - sz_copy_avx2(target, source, length); -#elif SZ_USE_ARM_NEON - sz_copy_neon(target, source, length); -#else - sz_copy_serial(target, source, length); -#endif -} - -SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_X86_AVX512 - sz_move_avx512(target, source, length); -#elif SZ_USE_X86_AVX2 - sz_move_avx2(target, source, length); -#elif SZ_USE_ARM_NEON - sz_move_neon(target, source, length); -#else - sz_move_serial(target, source, length); -#endif -} - -SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) { -#if SZ_USE_X86_AVX512 - sz_fill_avx512(target, length, value); -#elif SZ_USE_X86_AVX2 - sz_fill_avx2(target, length, value); -#elif SZ_USE_ARM_NEON - sz_fill_neon(target, length, value); -#else - sz_fill_serial(target, length, value); -#endif -} - -SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { -#if SZ_USE_X86_AVX512 - sz_look_up_transform_avx512(source, length, lut, target); -#elif SZ_USE_X86_AVX2 - sz_look_up_transform_avx2(source, length, lut, target); -#elif SZ_USE_ARM_NEON - sz_look_up_transform_neon(source, length, lut, target); -#else - sz_look_up_transform_serial(source, length, lut, target); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_X86_AVX512 - return sz_find_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_X86_AVX2 - return sz_find_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_ARM_NEON - return sz_find_byte_neon(haystack, h_length, needle); -#else - return sz_find_byte_serial(haystack, h_length, needle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_X86_AVX512 - return sz_rfind_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_X86_AVX2 - return sz_rfind_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_ARM_NEON 
- return sz_rfind_byte_neon(haystack, h_length, needle); -#else - return sz_rfind_byte_serial(haystack, h_length, needle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_X86_AVX512 - return sz_find_avx512(haystack, h_length, needle, n_length); -#elif SZ_USE_X86_AVX2 - return sz_find_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_ARM_NEON - return sz_find_neon(haystack, h_length, needle, n_length); -#else - return sz_find_serial(haystack, h_length, needle, n_length); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_X86_AVX512 - return sz_rfind_avx512(haystack, h_length, needle, n_length); -#elif SZ_USE_X86_AVX2 - return sz_rfind_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_ARM_NEON - return sz_rfind_neon(haystack, h_length, needle, n_length); -#else - return sz_rfind_serial(haystack, h_length, needle, n_length); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_X86_AVX512 - return sz_find_charset_avx512(text, length, set); -#elif SZ_USE_X86_AVX2 - return sz_find_charset_avx2(text, length, set); -#elif SZ_USE_ARM_NEON - return sz_find_charset_neon(text, length, set); -#else - return sz_find_charset_serial(text, length, set); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_X86_AVX512 - return sz_rfind_charset_avx512(text, length, set); -#elif SZ_USE_X86_AVX2 - return sz_rfind_charset_avx2(text, length, set); -#elif SZ_USE_ARM_NEON - return sz_rfind_charset_neon(text, length, set); -#else - return sz_rfind_charset_serial(text, length, set); -#endif -} - -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_serial(a, a_length, b, b_length, bound); -} - -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_utf8_serial(a, a_length, b, b_length, bound); -} - -SZ_DYNAMIC sz_size_t sz_edit_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { -#if SZ_USE_X86_AVX512 - return sz_edit_distance_avx512(a, a_length, b, b_length, bound, alloc); -#else - return sz_edit_distance_serial(a, a_length, b, b_length, bound, alloc); -#endif -} - -SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - return _sz_edit_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc); -} - -SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_error_cost_t const *subs, sz_error_cost_t gap, - sz_memory_allocator_t *alloc) { -#if SZ_USE_X86_AVX512 - return sz_alignment_score_avx512(a, a_length, b, b_length, subs, gap, alloc); -#else - return sz_alignment_score_serial(a, a_length, b, b_length, subs, gap, alloc); -#endif -} - SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // sz_hash_callback_t callback, void *callback_handle) { -#if SZ_USE_X86_AVX512 - sz_hashes_avx512(text, length, window_length, 
window_step, callback, callback_handle); -#elif SZ_USE_X86_AVX2 +#if SZ_USE_ICE + sz_hashes_ice(text, length, window_length, window_step, callback, callback_handle); +#elif SZ_USE_HASWELL sz_hashes_avx2(text, length, window_length, window_step, callback, callback_handle); #else sz_hashes_serial(text, length, window_length, window_step, callback, callback_handle); #endif } -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_rfind_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_rfind_charset(h, h_length, &set); -} - SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, sz_random_generator_t generator, void *generator_user_data) { sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data); } -#endif -#pragma endregion +#endif // !SZ_DYNAMIC_DISPATCH +#pragma endregion // Compile Time Dispatching #ifdef __cplusplus -#pragma GCC diagnostic pop } #endif // __cplusplus - -#endif // STRINGZILLA_H_ +#endif // STRINGZILLA_HASH_H_ From 86f53d99b93b0495b3f8f5ce81e607e1dc80e765 Mon Sep 17 00:00:00 2001 From: Alex Bondarev <44079602+alexbarev@users.noreply.github.com> Date: Sat, 7 Dec 2024 21:56:31 +0400 Subject: [PATCH 035/751] Test: Add ASCII utilities tests exposing final character exclusion bug --- scripts/test.cpp | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/scripts/test.cpp b/scripts/test.cpp index eecc97f0..e8123995 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -137,6 +137,49 @@ static void test_arithmetical_utilities() { (static_cast(number) / static_cast(divisor))); } +/** + * @brief Tests various ASCII-based methods (e.g., is_alpha, is_digit) + * provided by `sz::string` and `sz::string_view`. 
+ */ +template +static void test_ascii_utilities() { + + using str = string_type; + + assert(!str("").is_alpha()); + assert(str("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ").is_alpha()); + assert(!str("abc9").is_alpha()); + + assert(!str("").is_alnum()); + assert(str("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789").is_alnum()); + assert(!str("abc!").is_alnum()); + + assert(!str("").is_ascii()); + assert(str("\x00x7F").is_ascii()); + assert(!str("abc123🔥").is_ascii()); + + assert(!str("").is_digit()); + assert(str("0123456789").is_digit()); + assert(!str("012a").is_digit()); + + assert(!str("").is_lower()); + assert(str("abcdefghijklmnopqrstuvwxyz").is_lower()); + assert(!str("abcA").is_lower()); + assert(!str("abc\n").is_lower()); + + assert(!str("").is_space()); + assert(str(" \t\n\r\f\v").is_space()); + assert(!str(" \t\r\na").is_space()); + + assert(!str("").is_upper()); + assert(str("ABCDEFGHIJKLMNOPQRSTUVWXYZ").is_upper()); + assert(!str("ABCa").is_upper()); + + assert(!str("").is_printable()); + assert(str("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+").is_printable()); + assert(!str("012\n").is_printable()); +} + inline void expect_equality(char const *a, char const *b, std::size_t size) { if (std::memcmp(a, b, size) == 0) return; std::size_t mismatch_position = 0; @@ -1583,6 +1626,8 @@ int main(int argc, char const **argv) { // Basic utilities test_arithmetical_utilities(); + test_ascii_utilities(); + test_ascii_utilities(); test_memory_utilities(); test_replacements(); From 8b44d6a5fe4d4ee3cf38d76d2d690bcf5b1a8a2d Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 18:26:40 +0000 Subject: [PATCH 036/751] Improve: Platform-specific equality checks --- include/stringzilla/find.h | 43 +++++++++++++------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/include/stringzilla/find.h b/include/stringzilla/find.h index a51bd4c6..4571515d 100644 --- a/include/stringzilla/find.h +++ b/include/stringzilla/find.h @@ -305,20 +305,6 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz #pragma GCC diagnostic pop } -/** - * @brief Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each byte is set. - // For that take the bottom 7 bits of each byte, add one to them, - // and if this sets the top bit to one, then all the 7 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) & ((vec.u64 & 0x8080808080808080ull)); - return vec; -} - /* Find the first occurrence of a @b single-character needle in an arbitrary length haystack. * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. * Identical to `memchr(haystack, needle[0], haystack_length)`. 
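The comments around the `_sz_u64_each_byte_equal` helper removed in the hunk above describe the SWAR trick behind the serial byte-search path. For readers skimming the diff, here is a minimal, self-contained sketch of that trick; `broadcast_byte`, `each_byte_equal`, and the `main` driver are illustrative names, not StringZilla API, and the lowest-set-bit indexing at the end only matches the library's behavior on little-endian targets.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Broadcast one byte into all eight lanes of a 64-bit word. */
static uint64_t broadcast_byte(uint8_t b) { return 0x0101010101010101ull * b; }

/* Set the top bit of every byte of `word` that equals `b`, using the same
 * "mask the low 7 bits, add one, and keep the carry" trick as the removed helper:
 * a lane carries into its top bit only if all 7 low bits of ~(word ^ pattern) are ones,
 * and the final AND keeps that carry only if the 8th bit matched as well. */
static uint64_t each_byte_equal(uint64_t word, uint8_t b) {
    uint64_t x = ~(word ^ broadcast_byte(b));
    return ((x & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) & (x & 0x8080808080808080ull);
}

int main(void) {
    char const text[] = "abcdefgh";
    uint64_t word;
    memcpy(&word, text, sizeof(word)); /* one 8-byte load, alignment handled by memcpy */
    uint64_t matches = each_byte_equal(word, 'd');
    /* On little-endian targets the first match is the lowest set bit; GCC/Clang builtin. */
    if (matches) printf("first 'd' at offset %d\n", __builtin_ctzll(matches) / 8);
    return 0;
}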
@@ -895,7 +881,7 @@ SZ_PUBLIC sz_cptr_t sz_find_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); while (matches) { int potential_offset = sz_u32_ctz(matches); - if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset; + if (sz_equal_haswell(h + potential_offset, n, n_length)) return h + potential_offset; matches &= matches - 1; } } @@ -933,7 +919,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); while (matches) { int potential_offset = sz_u32_clz(matches); - if (sz_equal(h + h_length - n_length - potential_offset, n, n_length)) + if (sz_equal_haswell(h + h_length - n_length - potential_offset, n, n_length)) return h + h_length - n_length - potential_offset; matches &= ~(1 << (31 - potential_offset)); } @@ -1074,7 +1060,7 @@ SZ_PUBLIC sz_bool_t sz_equal_skylake(sz_cptr_t a, sz_cptr_t b, sz_size_t length) return sz_true_k; } -SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { +SZ_PUBLIC sz_cptr_t sz_find_byte_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { __mmask64 mask; sz_u512_vec_t h_vec, n_vec; n_vec.zmm = _mm512_set1_epi8(n[0]); @@ -1101,7 +1087,7 @@ SZ_PUBLIC sz_cptr_t sz_find_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n // This almost never fires, but it's better to be safe than sorry. if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_avx512(h, h_length, n); + if (n_length == 1) return sz_find_byte_skylake(h, h_length, n); // Pick the parts of the needle that are worth comparing. sz_size_t offset_first, offset_mid, offset_last; @@ -1198,7 +1184,7 @@ SZ_PUBLIC sz_cptr_t sz_find_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n return SZ_NULL_CHAR; } -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { +SZ_PUBLIC sz_cptr_t sz_rfind_byte_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { __mmask64 mask; sz_u512_vec_t h_vec, n_vec; n_vec.zmm = _mm512_set1_epi8(n[0]); @@ -1225,7 +1211,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t // This almost never fires, but it's better to be safe than sorry. if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx512(h, h_length, n); + if (n_length == 1) return sz_rfind_byte_skylake(h, h_length, n); // Pick the parts of the needle that are worth comparing. 
sz_size_t offset_first, offset_mid, offset_last; @@ -1583,7 +1569,7 @@ SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, s matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); while (matches) { int potential_offset = sz_u64_ctz(matches) / 4; - if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset; + if (sz_equal_neon(h + potential_offset, n, n_length)) return h + potential_offset; matches &= matches - 1; } } @@ -1623,7 +1609,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); while (matches) { int potential_offset = sz_u64_clz(matches) / 4; - if (sz_equal(h + h_length - n_length - potential_offset, n, n_length)) + if (sz_equal_neon(h + h_length - n_length - potential_offset, n, n_length)) return h + h_length - n_length - potential_offset; sz_assert((matches & (1ull << (63 - potential_offset * 4))) != 0 && "The bit must be set before we squash it"); @@ -1678,6 +1664,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_ch #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+sve") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) + #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_SVE @@ -1692,8 +1679,8 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_ch #pragma region Core Funcitonality SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_ICE - return sz_find_byte_avx512(haystack, h_length, needle); +#if SZ_USE_SKYLAKE + return sz_find_byte_skylake(haystack, h_length, needle); #elif SZ_USE_HASWELL return sz_find_byte_haswell(haystack, h_length, needle); #elif SZ_USE_NEON @@ -1704,8 +1691,8 @@ SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cpt } SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_ICE - return sz_rfind_byte_avx512(haystack, h_length, needle); +#if SZ_USE_SKYLAKE + return sz_rfind_byte_skylake(haystack, h_length, needle); #elif SZ_USE_HASWELL return sz_rfind_byte_haswell(haystack, h_length, needle); #elif SZ_USE_NEON @@ -1716,7 +1703,7 @@ SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cp } SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_ICE +#if SZ_USE_SKYLAKE return sz_find_skylake(haystack, h_length, needle, n_length); #elif SZ_USE_HASWELL return sz_find_haswell(haystack, h_length, needle, n_length); @@ -1728,7 +1715,7 @@ SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t n } SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_ICE +#if SZ_USE_SKYLAKE return sz_rfind_skylake(haystack, h_length, needle, n_length); #elif SZ_USE_HASWELL return sz_rfind_haswell(haystack, h_length, needle, n_length); From 4a1f03c46b4f60be3b28e31f58b500734e23699e Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 18:32:35 +0000 Subject: [PATCH 037/751] Make: Separate builds for Skylake & Ice --- CMakeLists.txt | 24 +++++++++++++----------- README.md | 2 +- build.rs | 46 ++++++++++++++++++++++++++-------------------- c/lib.c | 41 +++++++++++++++++++++++------------------ setup.py | 27 +++++++++++++++------------ 5 files changed, 78 
insertions(+), 62 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 93a9b847..c09fd6e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,7 +46,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64|arm64|ARM64") message(STATUS "Platform: ARM") endif() -# Determine if StringZilla is built as a subproject (using `add_subdirectory`) +# Determine if StringZilla is built as a sub-project (using `add_subdirectory`) # or if it is the main project set(STRINGZILLA_IS_MAIN_PROJECT OFF) @@ -99,7 +99,7 @@ endif() if (MSVC) # Remove /RTC* from MSVC debug flags by default (it will be added back in the set_compiler_flags function) - # Beacuse /RTC* cannot be used without the crt so it needs to be disabled for that specifc target + # Because /RTC* cannot be used without the crt so it needs to be disabled for that specific target string(REGEX REPLACE "/RTC[^ ]*" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") string(REGEX REPLACE "/RTC[^ ]*" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") endif() @@ -303,18 +303,20 @@ if(${STRINGZILLA_BUILD_SHARED}) endif() target_compile_definitions(${target} PRIVATE - "SZ_USE_X86_AVX512=1" - "SZ_USE_X86_AVX2=1" - "SZ_USE_ARM_NEON=0" - "SZ_USE_ARM_SVE=0") + "SZ_USE_HASWELL=1" + "SZ_USE_SKYLAKE=1" + "SZ_USE_ICE=1" + "SZ_USE_NEON=0" + "SZ_USE_SVE=0") elseif(SZ_PLATFORM_ARM) set_compiler_flags(${target} "" "armv8-a") target_compile_definitions(${target} PRIVATE - "SZ_USE_X86_AVX512=0" - "SZ_USE_X86_AVX2=0" - "SZ_USE_ARM_NEON=1" - "SZ_USE_ARM_SVE=1") + "SZ_USE_HASWELL=0" + "SZ_USE_SKYLAKE=0" + "SZ_USE_ICE=0" + "SZ_USE_NEON=1" + "SZ_USE_SVE=1") endif() if (MSVC) @@ -337,7 +339,7 @@ if(${STRINGZILLA_BUILD_SHARED}) target_compile_definitions(stringzillite PRIVATE "SZ_AVOID_LIBC=1") target_compile_definitions(stringzillite PRIVATE "SZ_OVERRIDE_LIBC=1") - # Avoid built-ins on MSVC and other compilers, as that will cause compileration errors + # Avoid built-ins on MSVC and other compilers, as that will cause compilation errors target_compile_options(stringzillite PRIVATE "$<$:-fno-builtin;-nostdlib>" "$<$:/Oi-;/GS->") diff --git a/README.md b/README.md index d5c59ff9..c4122696 100644 --- a/README.md +++ b/README.md @@ -1172,7 +1172,7 @@ __`SZ_DEBUG`__: > If you want to enable more aggressive bounds-checking, define `SZ_DEBUG` before including the header. > If not explicitly set, it will be inferred from the build type. -__`SZ_USE_X86_AVX512`, `SZ_USE_X86_AVX2`, `SZ_USE_ARM_NEON`__: +__`SZ_USE_HASWELL`, `SZ_USE_SKYLAKE`, `SZ_USE_ICE`, `SZ_USE_NEON`, `SZ_USE_SVE`__: > One can explicitly disable certain families of SIMD instructions for compatibility purposes. > Default values are inferred at compile time. 
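Since the README hunk above only names the new toggles, a hedged usage sketch may help: assuming the headers keep the usual `#ifndef` guards around these macros (as the older `SZ_USE_X86_*` ones did), a consumer can pre-define them before the include, or pass equivalent `-D` flags, to opt out of a SIMD family at compile time. The include path and the tiny `main` below are illustrative only.

#define SZ_USE_ICE 0     /* skip the Ice Lake kernels (AVX-512 VBMI2/VBMI/BW/VL paths) */
#define SZ_USE_SKYLAKE 0 /* skip the Skylake-X kernels (AVX-512 F paths) */
#define SZ_USE_HASWELL 1 /* keep the Haswell kernels (AVX2 paths) */
#include <stringzilla/stringzilla.h>

int main(void) {
    char const haystack[] = "stringzilla";
    char const needle = 'z';
    sz_cptr_t match = sz_find_byte(haystack, sizeof(haystack) - 1, &needle);
    return match ? 0 : 1;
}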
diff --git a/build.rs b/build.rs index 8f7a130d..bb5fb5cf 100644 --- a/build.rs +++ b/build.rs @@ -25,20 +25,22 @@ fn main() { // Set architecture-specific flags and macros if target_arch == "x86_64" { - build.define("SZ_USE_X86_AVX512", "1"); - build.define("SZ_USE_X86_AVX2", "1"); + build.define("SZ_USE_HASWELL", "1"); + build.define("SZ_USE_SKYLAKE", "1"); + build.define("SZ_USE_ICE", "1"); } else { - build.define("SZ_USE_X86_AVX512", "0"); - build.define("SZ_USE_X86_AVX2", "0"); + build.define("SZ_USE_HASWELL", "0"); + build.define("SZ_USE_SKYLAKE", "0"); + build.define("SZ_USE_ICE", "0"); } if target_arch == "aarch64" { build.flag_if_supported("-march=armv8-a+simd"); - build.define("SZ_USE_ARM_SVE", "1"); - build.define("SZ_USE_ARM_NEON", "1"); + build.define("SZ_USE_NEON", "1"); + build.define("SZ_USE_SVE", "1"); } else { - build.define("SZ_USE_ARM_SVE", "0"); - build.define("SZ_USE_ARM_NEON", "0"); + build.define("SZ_USE_NEON", "0"); + build.define("SZ_USE_SVE", "0"); } } else if target.contains("darwin") { build.flag_if_supported("-fcolor-diagnostics"); @@ -47,28 +49,32 @@ fn main() { if target_arch == "x86_64" { // Assuming no AVX-512 support for Darwin as per setup.py logic - build.define("SZ_USE_X86_AVX512", "0"); - build.define("SZ_USE_X86_AVX2", "1"); + build.define("SZ_USE_HASWELL", "1"); + build.define("SZ_USE_SKYLAKE", "0"); + build.define("SZ_USE_ICE", "0"); } else { - build.define("SZ_USE_X86_AVX512", "0"); - build.define("SZ_USE_X86_AVX2", "0"); + build.define("SZ_USE_HASWELL", "0"); + build.define("SZ_USE_SKYLAKE", "0"); + build.define("SZ_USE_ICE", "0"); } if target_arch == "aarch64" { - build.define("SZ_USE_ARM_SVE", "0"); // Assuming no SVE support for Darwin - build.define("SZ_USE_ARM_NEON", "1"); + build.define("SZ_USE_NEON", "1"); + build.define("SZ_USE_SVE", "0"); // Assuming no SVE support for Darwin } else { - build.define("SZ_USE_ARM_SVE", "0"); - build.define("SZ_USE_ARM_NEON", "0"); + build.define("SZ_USE_NEON", "0"); + build.define("SZ_USE_SVE", "0"); } } else if target.contains("windows") { // Set architecture-specific flags and macros if target_arch == "x86_64" { - build.define("SZ_USE_X86_AVX512", "1"); - build.define("SZ_USE_X86_AVX2", "1"); + build.define("SZ_USE_HASWELL", "1"); + build.define("SZ_USE_SKYLAKE", "1"); + build.define("SZ_USE_ICE", "1"); } else { - build.define("SZ_USE_X86_AVX512", "0"); - build.define("SZ_USE_X86_AVX2", "0"); + build.define("SZ_USE_HASWELL", "0"); + build.define("SZ_USE_SKYLAKE", "0"); + build.define("SZ_USE_ICE", "0"); } } diff --git a/c/lib.c b/c/lib.c index 2394bf59..e1d98328 100644 --- a/c/lib.c +++ b/c/lib.c @@ -77,7 +77,7 @@ SZ_INTERNAL sz_capability_t sz_capabilities_arm(void) { SZ_DYNAMIC sz_capability_t sz_capabilities(void) { -#if SZ_USE_X86_AVX512 || SZ_USE_X86_AVX2 +#if SZ_USE_HASWELL || SZ_USE_SKYLAKE || SZ_USE_ICE /// The states of 4 registers populated for a specific "cpuid" assembly call union four_registers_t { @@ -131,7 +131,7 @@ SZ_DYNAMIC sz_capability_t sz_capabilities(void) { #endif // SZ_TARGET_X86 -#if SZ_USE_ARM_NEON || SZ_USE_ARM_SVE +#if SZ_USE_NEON || SZ_USE_SVE return sz_capabilities_arm(); @@ -196,7 +196,7 @@ static void sz_dispatch_table_init(void) { impl->alignment_score = sz_alignment_score_serial; impl->hashes = sz_hashes_serial; -#if SZ_USE_X86_AVX2 +#if SZ_USE_HASWELL if (caps & sz_cap_x86_avx2_k) { impl->equal = sz_equal_avx2; impl->order = sz_order_avx2; @@ -216,34 +216,36 @@ static void sz_dispatch_table_init(void) { } #endif -#if SZ_USE_X86_AVX512 +#if SZ_USE_SKYLAKE if (caps & 
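The same renaming has to stay consistent across the CMake, Rust, and Python build scripts, which is easy to break silently. One way to confirm what a given target actually ended up with is a throwaway diagnostic like the sketch below; it is not part of the repository and assumes the umbrella `stringzilla/stringzilla.h` header defines every `SZ_USE_*` macro to 0 or 1 after applying its defaults.

#include <stdio.h>
#include <stringzilla/stringzilla.h>

int main(void) {
    /* Each macro is expected to expand to 0 or 1 once the header has applied its defaults. */
    printf("haswell=%d skylake=%d ice=%d neon=%d sve=%d\n",
           (int)(SZ_USE_HASWELL), (int)(SZ_USE_SKYLAKE), (int)(SZ_USE_ICE),
           (int)(SZ_USE_NEON), (int)(SZ_USE_SVE));
    return 0;
}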
sz_cap_x86_avx512f_k) { - impl->equal = sz_equal_avx512; + impl->equal = sz_equal_skylake; impl->order = sz_order_avx512; impl->copy = sz_copy_avx512; impl->move = sz_move_avx512; impl->fill = sz_fill_avx512; - impl->find = sz_find_avx512; - impl->rfind = sz_rfind_avx512; + impl->find = sz_find_skylake; + impl->rfind = sz_rfind_skylake; impl->find_byte = sz_find_byte_avx512; impl->rfind_byte = sz_rfind_byte_avx512; impl->edit_distance = sz_edit_distance_avx512; } +#endif +#if SZ_USE_ICE if ((caps & sz_cap_x86_avx512f_k) && (caps & sz_cap_x86_avx512vl_k) && (caps & sz_cap_x86_avx512vbmi2_k) && (caps & sz_cap_x86_avx512bw_k) && (caps & sz_cap_x86_avx512vbmi_k)) { - impl->find_from_set = sz_find_charset_avx512; - impl->rfind_from_set = sz_rfind_charset_avx512; + impl->find_from_set = sz_find_charset_ice; + impl->rfind_from_set = sz_rfind_charset_ice; impl->alignment_score = sz_alignment_score_avx512; - impl->look_up_transform = sz_look_up_transform_avx512; + impl->look_up_transform = sz_look_up_transform_ice; impl->checksum = sz_checksum_avx512; } #endif -#if SZ_USE_ARM_NEON +#if SZ_USE_NEON if (caps & sz_cap_arm_neon_k) { impl->equal = sz_equal_neon; @@ -361,14 +363,16 @@ SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( // return _sz_edit_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc); } -SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_error_cost_t const *subs, sz_error_cost_t gap, - sz_memory_allocator_t *alloc) { +SZ_DYNAMIC sz_ssize_t sz_alignment_score( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // + sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { return sz_dispatch_table.alignment_score(a, a_length, b, b_length, subs, gap, alloc); } -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { +SZ_DYNAMIC void sz_hashes( // + sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // + sz_hash_callback_t callback, void *callback_handle) { sz_dispatch_table.hashes(text, length, window_length, step, callback, callback_handle); } @@ -409,8 +413,9 @@ sz_u64_t _sz_random_generator(void *empty_state) { } #endif -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { +SZ_DYNAMIC void sz_generate( // + sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, + sz_random_generator_t generator, void *generator_user_data) { #if !SZ_AVOID_LIBC if (!generator) generator = _sz_random_generator; #endif diff --git a/setup.py b/setup.py index 25b769e8..27ef6be2 100644 --- a/setup.py +++ b/setup.py @@ -54,10 +54,11 @@ def linux_settings() -> Tuple[List[str], List[str], List[Tuple[str]]]: # GCC is our primary compiler, so when packaging the library, even if the current machine # doesn't support AVX-512 or SVE, still precompile those. 
macros_args = [ - ("SZ_USE_X86_AVX512", "1" if is_64bit_x86() else "0"), - ("SZ_USE_X86_AVX2", "1" if is_64bit_x86() else "0"), - ("SZ_USE_ARM_SVE", "1" if is_64bit_arm() else "0"), - ("SZ_USE_ARM_NEON", "1" if is_64bit_arm() else "0"), + ("SZ_USE_HASWELL", "1" if is_64bit_x86() else "0"), + ("SZ_USE_SKYLAKE", "1" if is_64bit_x86() else "0"), + ("SZ_USE_ICE", "1" if is_64bit_x86() else "0"), + ("SZ_USE_NEON", "1" if is_64bit_arm() else "0"), + ("SZ_USE_SVE", "1" if is_64bit_arm() else "0"), ("SZ_DETECT_BIG_ENDIAN", "1" if is_big_endian() else "0"), ] @@ -89,10 +90,11 @@ def darwin_settings() -> Tuple[List[str], List[str], List[Tuple[str]]]: # During Universal builds, however, even AVX header cause compilation errors. can_use_avx2 = is_64bit_x86() and sysconfig.get_platform().startswith("universal") macros_args = [ - ("SZ_USE_X86_AVX512", "0"), - ("SZ_USE_X86_AVX2", "1" if can_use_avx2 else "0"), - ("SZ_USE_ARM_SVE", "0"), - ("SZ_USE_ARM_NEON", "1" if is_64bit_arm() else "0"), + ("SZ_USE_HASWELL", "1" if can_use_avx2 else "0"), + ("SZ_USE_SKYLAKE", "0"), + ("SZ_USE_ICE", "0"), + ("SZ_USE_NEON", "1" if is_64bit_arm() else "0"), + ("SZ_USE_SVE", "0"), ] return compile_args, link_args, macros_args @@ -107,10 +109,11 @@ def windows_settings() -> Tuple[List[str], List[str], List[Tuple[str]]]: # When packaging the library, even if the current machine doesn't support AVX-512 or SVE, still precompile those. macros_args = [ - ("SZ_USE_X86_AVX512", "1" if is_64bit_x86() else "0"), - ("SZ_USE_X86_AVX2", "1" if is_64bit_x86() else "0"), - ("SZ_USE_ARM_SVE", "0"), - ("SZ_USE_ARM_NEON", "1" if is_64bit_arm() else "0"), + ("SZ_USE_HASWELL", "1" if is_64bit_x86() else "0"), + ("SZ_USE_SKYLAKE", "1" if is_64bit_x86() else "0"), + ("SZ_USE_ICE", "1" if is_64bit_x86() else "0"), + ("SZ_USE_NEON", "1" if is_64bit_arm() else "0"), + ("SZ_USE_SVE", "0"), ("SZ_DETECT_BIG_ENDIAN", "1" if is_big_endian() else "0"), ] From 5b55e19d1378c61da88309b30a38f9cf7c64bf79 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 18:37:32 +0000 Subject: [PATCH 038/751] Fix: Filter `small_string.h` file --- include/stringzilla/small_string.h | 7148 +--------------------------- 1 file changed, 216 insertions(+), 6932 deletions(-) diff --git a/include/stringzilla/small_string.h b/include/stringzilla/small_string.h index de7fbcac..17625700 100644 --- a/include/stringzilla/small_string.h +++ b/include/stringzilla/small_string.h @@ -1,365 +1,47 @@ /** - * @brief StringZilla is a collection of advanced string algorithms, designed to be used in Big Data applications. - * It is generally faster than LibC, and has a broader & cleaner interface, and targets modern x86 CPUs - * with AVX-512 and Arm NEON and older CPUs with SWAR and auto-vectorization. - * - * Consider overriding the following macros to customize the library: - * - * - `SZ_DEBUG=0` - whether to enable debug assertions and logging. - * - `SZ_DYNAMIC_DISPATCH=0` - whether to use runtime dispatching of the most advanced SIMD backend. - * - `SZ_USE_MISALIGNED_LOADS=0` - whether to use misaligned loads on platforms that support them. - * - `SZ_SWAR_THRESHOLD=24` - threshold for switching to SWAR backend over serial byte-level for-loops. - * - `SZ_USE_X86_AVX512=?` - whether to use AVX-512 instructions on x86_64. - * - `SZ_USE_X86_AVX2=?` - whether to use AVX2 instructions on x86_64. - * - `SZ_USE_ARM_NEON=?` - whether to use NEON instructions on ARM. - * - `SZ_USE_ARM_SVE=?` - whether to use SVE instructions on ARM. 
- * - * @see StringZilla: https://github.com/ashvardanian/StringZilla/blob/main/README.md - * @see LibC String: https://pubs.opengroup.org/onlinepubs/009695399/basedefs/string.h.html - * - * @file stringzilla.h + * @brief Small String Optimization implemented as a C 99 structure. + * @file small_string.h * @author Ash Vardanian - */ -#ifndef STRINGZILLA_H_ -#define STRINGZILLA_H_ - -#define STRINGZILLA_VERSION_MAJOR 3 -#define STRINGZILLA_VERSION_MINOR 11 -#define STRINGZILLA_VERSION_PATCH 0 - -/** - * @brief When set to 1, the library will include the following LibC headers: and . - * In debug builds (SZ_DEBUG=1), the library will also include and . * - * You may want to disable this compiling for use in the kernel, or in embedded systems. - * You may also avoid them, if you are very sensitive to compilation time and avoid pre-compiled headers. - * https://artificial-mind.net/projects/compile-health/ - */ -#ifndef SZ_AVOID_LIBC -#define SZ_AVOID_LIBC (0) // true or false -#endif - -/** - * @brief A misaligned load can be - trying to fetch eight consecutive bytes from an address - * that is not divisible by eight. On x86 enabled by default. On ARM it's not. - * - * Most platforms support it, but there is no industry standard way to check for those. - * This value will mostly affect the performance of the serial (SWAR) backend. - */ -#ifndef SZ_USE_MISALIGNED_LOADS -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) -#define SZ_USE_MISALIGNED_LOADS (1) // true or false -#else -#define SZ_USE_MISALIGNED_LOADS (0) // true or false -#endif -#endif - -/** - * @brief Removes compile-time dispatching, and replaces it with runtime dispatching. - * So the `sz_find` function will invoke the most advanced backend supported by the CPU, - * that runs the program, rather than the most advanced backend supported by the CPU - * used to compile the library or the downstream application. - */ -#ifndef SZ_DYNAMIC_DISPATCH -#define SZ_DYNAMIC_DISPATCH (0) // true or false -#endif - -/** - * @brief Analogous to `size_t` and `std::size_t`, unsigned integer, identical to pointer size. - * 64-bit on most platforms where pointers are 64-bit. - * 32-bit on platforms where pointers are 32-bit. - */ -#if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64) -#define SZ_DETECT_64_BIT (1) -#define SZ_SIZE_MAX (0xFFFFFFFFFFFFFFFFull) // Largest unsigned integer that fits into 64 bits. -#define SZ_SSIZE_MAX (0x7FFFFFFFFFFFFFFFull) // Largest signed integer that fits into 64 bits. -#else -#define SZ_DETECT_64_BIT (0) -#define SZ_SIZE_MAX (0xFFFFFFFFu) // Largest unsigned integer that fits into 32 bits. -#define SZ_SSIZE_MAX (0x7FFFFFFFu) // Largest signed integer that fits into 32 bits. -#endif - -/** - * @brief On Big-Endian machines StringZilla will work in compatibility mode. - * This disables SWAR hacks to minimize code duplication, assuming practically - * all modern popular platforms are Little-Endian. + * Includes core APIs: + * - `sz_string_init` + * - `sz_string_init_length` + * - `sz_string_free` * - * This variable is hard to infer from macros reliably. It's best to set it manually. - * For that CMake provides the `TestBigEndian` and `CMAKE__BYTE_ORDER` (from 3.20 onwards). - * In Python one can check `sys.byteorder == 'big'` in the `setup.py` script and pass the appropriate macro. 
- * https://stackoverflow.com/a/27054190 - */ -#ifndef SZ_DETECT_BIG_ENDIAN -#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \ - defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || defined(__MIBSEB) || defined(__MIBSEB__) -#define SZ_DETECT_BIG_ENDIAN (1) //< It's a big-endian target architecture -#else -#define SZ_DETECT_BIG_ENDIAN (0) //< It's a little-endian target architecture -#endif -#endif - -/* - * Debugging and testing. - */ -#ifndef SZ_DEBUG -#if defined(DEBUG) || defined(_DEBUG) // This means "Not using DEBUG information". -#define SZ_DEBUG (1) -#else -#define SZ_DEBUG (0) -#endif -#endif - -/** - * @brief Threshold for switching to SWAR (8-bytes at a time) backend over serial byte-level for-loops. - * On very short strings, under 16 bytes long, at most a single word will be processed with SWAR. - * Assuming potentially misaligned loads, SWAR makes sense only after ~24 bytes. - */ -#ifndef SZ_SWAR_THRESHOLD -#if SZ_DEBUG -#define SZ_SWAR_THRESHOLD (8u) // 8 bytes in debug builds -#else -#define SZ_SWAR_THRESHOLD (24u) // 24 bytes in release builds -#endif -#endif - -/* Annotation for the public API symbols: + * Accessing the underlying string: + * - `sz_string_is_on_stack` + * - `sz_string_unpack` + * - `sz_string_range` + * - `sz_string_equal` + * - `sz_string_order` * - * - `SZ_PUBLIC` is used for functions that are part of the public API. - * - `SZ_INTERNAL` is used for internal helper functions with unstable APIs. - * - `SZ_DYNAMIC` is used for functions that are part of the public API, but are dispatched at runtime. + * Modifying the string: + * - `sz_string_reserve` + * - `sz_string_expand` + * - `sz_string_erase` + * - `sz_string_shrink_to_fit` */ -#ifndef SZ_DYNAMIC -#if SZ_DYNAMIC_DISPATCH -#if defined(_WIN32) || defined(__CYGWIN__) -#define SZ_DYNAMIC __declspec(dllexport) -#define SZ_EXTERNAL __declspec(dllimport) -#define SZ_PUBLIC inline static -#define SZ_INTERNAL inline static -#else -#define SZ_DYNAMIC __attribute__((visibility("default"))) -#define SZ_EXTERNAL extern -#define SZ_PUBLIC __attribute__((unused)) inline static -#define SZ_INTERNAL __attribute__((always_inline)) inline static -#endif // _WIN32 || __CYGWIN__ -#else -#define SZ_DYNAMIC inline static -#define SZ_EXTERNAL extern -#define SZ_PUBLIC inline static -#define SZ_INTERNAL inline static -#endif // SZ_DYNAMIC_DISPATCH -#endif // SZ_DYNAMIC +#ifndef STRINGZILLA_SMALL_STRING_H_ +#define STRINGZILLA_SMALL_STRING_H_ -/** - * @brief Alignment macro for 64-byte alignment. - */ -#if defined(_MSC_VER) -#define SZ_ALIGN64 __declspec(align(64)) -#elif defined(__GNUC__) || defined(__clang__) -#define SZ_ALIGN64 __attribute__((aligned(64))) -#else -#define SZ_ALIGN64 -#endif +#include "find.h" // `sz_equal` +#include "memory.h" // `sz_copy`, `sz_move`, `sz_fill` +#include "types.h" // `sz_size_t`, `sz_ptr_t`, `sz_cptr_t` #ifdef __cplusplus extern "C" { #endif -/* - * Let's infer the integer types or pull them from LibC, - * if that is allowed by the user. 
- */ -#if !SZ_AVOID_LIBC -#include // `size_t` -#include // `uint8_t` -typedef int8_t sz_i8_t; // Always 8 bits -typedef uint8_t sz_u8_t; // Always 8 bits -typedef uint16_t sz_u16_t; // Always 16 bits -typedef int32_t sz_i32_t; // Always 32 bits -typedef uint32_t sz_u32_t; // Always 32 bits -typedef uint64_t sz_u64_t; // Always 64 bits -typedef int64_t sz_i64_t; // Always 64 bits -typedef size_t sz_size_t; // Pointer-sized unsigned integer, 32 or 64 bits -typedef ptrdiff_t sz_ssize_t; // Signed version of `sz_size_t`, 32 or 64 bits - -#else // if SZ_AVOID_LIBC: - -// ! The C standard doesn't specify the signedness of char. -// ! On x86 char is signed by default while on Arm it is unsigned by default. -// ! That's why we don't define `sz_char_t` and generally use explicit `sz_i8_t` and `sz_u8_t`. -typedef signed char sz_i8_t; // Always 8 bits -typedef unsigned char sz_u8_t; // Always 8 bits -typedef unsigned short sz_u16_t; // Always 16 bits -typedef int sz_i32_t; // Always 32 bits -typedef unsigned int sz_u32_t; // Always 32 bits -typedef long long sz_i64_t; // Always 64 bits -typedef unsigned long long sz_u64_t; // Always 64 bits - -// Now we need to redefine the `size_t`. -// Microsoft Visual C++ (MSVC) typically follows LLP64 data model on 64-bit platforms, -// where integers, pointers, and long types have different sizes: -// -// > `int` is 32 bits -// > `long` is 32 bits -// > `long long` is 64 bits -// > pointer (thus, `size_t`) is 64 bits -// -// In contrast, GCC and Clang on 64-bit Unix-like systems typically follow the LP64 model, where: -// -// > `int` is 32 bits -// > `long` and pointer (thus, `size_t`) are 64 bits -// > `long long` is also 64 bits -// -// Source: https://learn.microsoft.com/en-us/windows/win32/winprog64/abstract-data-models -#if SZ_DETECT_64_BIT -typedef unsigned long long sz_size_t; // 64-bit. -typedef long long sz_ssize_t; // 64-bit. -#else -typedef unsigned sz_size_t; // 32-bit. -typedef unsigned sz_ssize_t; // 32-bit. -#endif // SZ_DETECT_64_BIT - -#endif // SZ_AVOID_LIBC - -/** - * @brief Compile-time assert macro similar to `static_assert` in C++. - */ -#define sz_static_assert(condition, name) \ - typedef struct { \ - int static_assert_##name : (condition) ? 1 : -1; \ - } sz_static_assert_##name##_t - -sz_static_assert(sizeof(sz_size_t) == sizeof(void *), sz_size_t_must_be_pointer_size); -sz_static_assert(sizeof(sz_ssize_t) == sizeof(void *), sz_ssize_t_must_be_pointer_size); - -#pragma region Public API - -typedef char *sz_ptr_t; // A type alias for `char *` -typedef char const *sz_cptr_t; // A type alias for `char const *` -typedef sz_i8_t sz_error_cost_t; // Character mismatch cost for fuzzy matching functions - -typedef sz_u64_t sz_sorted_idx_t; // Index of a sorted string in a list of strings - -typedef enum { sz_false_k = 0, sz_true_k = 1 } sz_bool_t; // Only one relevant bit -typedef enum { sz_less_k = -1, sz_equal_k = 0, sz_greater_k = 1 } sz_ordering_t; // Only three possible states: <=> - -/** - * @brief Tiny string-view structure. It's POD type, unlike the `std::string_view`. - */ -typedef struct sz_string_view_t { - sz_cptr_t start; - sz_size_t length; -} sz_string_view_t; - -/** - * @brief Enumeration of SIMD capabilities of the target architecture. - * Used to introspect the supported functionality of the dynamic library. 
- */ -typedef enum sz_capability_t { - sz_cap_serial_k = 1, /// Serial (non-SIMD) capability - sz_cap_any_k = 0x7FFFFFFF, /// Mask representing any capability - - sz_cap_arm_neon_k = 1 << 10, /// ARM NEON capability - sz_cap_arm_sve_k = 1 << 11, /// ARM SVE capability TODO: Not yet supported or used - sz_cap_arm_sve2_k = 1 << 12, - sz_cap_arm_sve2p1_k = 1 << 13, - sz_cap_x86_avx2_k = 1 << 20, /// x86 AVX2 capability - sz_cap_x86_avx512f_k = 1 << 21, /// x86 AVX512 F capability - sz_cap_x86_avx512bw_k = 1 << 22, /// x86 AVX512 BW instruction capability - sz_cap_x86_avx512vl_k = 1 << 23, /// x86 AVX512 VL instruction capability - sz_cap_x86_avx512vbmi_k = 1 << 24, /// x86 AVX512 VBMI instruction capability - sz_cap_x86_gfni_k = 1 << 25, /// x86 AVX512 GFNI instruction capability - -} sz_capability_t; - -/** - * @brief Function to determine the SIMD capabilities of the current machine @b only at @b runtime. - * @return A bitmask of the SIMD capabilities represented as a `sz_capability_t` enum value. - */ -SZ_DYNAMIC sz_capability_t sz_capabilities(void); - -/** - * @brief Bit-set structure for 256 possible byte values. Useful for filtering and search. - * @see sz_charset_init, sz_charset_add, sz_charset_contains, sz_charset_invert - */ -typedef union sz_charset_t { - sz_u64_t _u64s[4]; - sz_u32_t _u32s[8]; - sz_u16_t _u16s[16]; - sz_u8_t _u8s[32]; -} sz_charset_t; - -/** @brief Initializes a bit-set to an empty collection, meaning - all characters are banned. */ -SZ_PUBLIC void sz_charset_init(sz_charset_t *s) { s->_u64s[0] = s->_u64s[1] = s->_u64s[2] = s->_u64s[3] = 0; } - -/** @brief Adds a character to the set and accepts @b unsigned integers. */ -SZ_PUBLIC void sz_charset_add_u8(sz_charset_t *s, sz_u8_t c) { s->_u64s[c >> 6] |= (1ull << (c & 63u)); } - -/** @brief Adds a character to the set. Consider @b sz_charset_add_u8. */ -SZ_PUBLIC void sz_charset_add(sz_charset_t *s, char c) { sz_charset_add_u8(s, *(sz_u8_t *)(&c)); } // bitcast - -/** @brief Checks if the set contains a given character and accepts @b unsigned integers. */ -SZ_PUBLIC sz_bool_t sz_charset_contains_u8(sz_charset_t const *s, sz_u8_t c) { - // Checking the bit can be done in different ways: - // - (s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0 - // - (s->_u32s[c >> 5] & (1u << (c & 31u))) != 0 - // - (s->_u16s[c >> 4] & (1u << (c & 15u))) != 0 - // - (s->_u8s[c >> 3] & (1u << (c & 7u))) != 0 - return (sz_bool_t)((s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0); -} - -/** @brief Checks if the set contains a given character. Consider @b sz_charset_contains_u8. */ -SZ_PUBLIC sz_bool_t sz_charset_contains(sz_charset_t const *s, char c) { - return sz_charset_contains_u8(s, *(sz_u8_t *)(&c)); // bitcast -} - -/** @brief Inverts the contents of the set, so allowed character get disallowed, and vice versa. */ -SZ_PUBLIC void sz_charset_invert(sz_charset_t *s) { - s->_u64s[0] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[1] ^= 0xFFFFFFFFFFFFFFFFull, // - s->_u64s[2] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[3] ^= 0xFFFFFFFFFFFFFFFFull; -} - -typedef void *(*sz_memory_allocate_t)(sz_size_t, void *); -typedef void (*sz_memory_free_t)(void *, sz_size_t, void *); -typedef sz_u64_t (*sz_random_generator_t)(void *); - -/** - * @brief Some complex pattern matching algorithms may require memory allocations. - * This structure is used to pass the memory allocator to those functions. 
- * @see sz_memory_allocator_init_fixed - */ -typedef struct sz_memory_allocator_t { - sz_memory_allocate_t allocate; - sz_memory_free_t free; - void *handle; -} sz_memory_allocator_t; - -/** - * @brief Initializes a memory allocator to use the system default `malloc` and `free`. - * ! The function is not available if the library was compiled with `SZ_AVOID_LIBC`. - * - * @param alloc Memory allocator to initialize. - */ -SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc); - -/** - * @brief Initializes a memory allocator to use a static-capacity buffer. - * No dynamic allocations will be performed. - * - * @param alloc Memory allocator to initialize. - * @param buffer Buffer to use for allocations. - * @param length Length of the buffer. @b Must be greater than 8 bytes. Different values would be optimal for - * different algorithms and input lengths, but 4096 bytes (one RAM page) is a good default. - */ -SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length); +#pragma region Core Structure /** * @brief The number of bytes a stack-allocated string can hold, including the SZ_NULL termination character. * ! This can't be changed from outside. Don't use the `#error` as it may already be included and set. */ -#ifdef SZ_STRING_INTERNAL_SPACE -#undef SZ_STRING_INTERNAL_SPACE +#ifdef _SZ_STRING_INTERNAL_SPACE +#undef _SZ_STRING_INTERNAL_SPACE #endif -#define SZ_STRING_INTERNAL_SPACE (sizeof(sz_size_t) * 3 - 1) // 3 pointers minus one byte for an 8-bit length +#define _SZ_STRING_INTERNAL_SPACE (sizeof(sz_size_t) * 3 - 1) // 3 pointers minus one byte for an 8-bit length /** * @brief Tiny memory-owning string structure with a Small String Optimization (SSO). @@ -376,7 +58,7 @@ SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void */ typedef union sz_string_t { -#if !SZ_DETECT_BIG_ENDIAN +#if !_SZ_IS_BIG_ENDIAN struct external { sz_ptr_t start; @@ -388,7 +70,7 @@ typedef union sz_string_t { struct internal { sz_ptr_t start; sz_u8_t length; - char chars[SZ_STRING_INTERNAL_SPACE]; + char chars[_SZ_STRING_INTERNAL_SPACE]; } internal; #else @@ -402,7 +84,7 @@ typedef union sz_string_t { struct internal { sz_ptr_t start; - char chars[SZ_STRING_INTERNAL_SPACE]; + char chars[_SZ_STRING_INTERNAL_SPACE]; sz_u8_t length; } internal; @@ -412,206 +94,9 @@ typedef union sz_string_t { } sz_string_t; -typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t); -typedef sz_u64_t (*sz_checksum_t)(sz_cptr_t, sz_size_t); -typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t); -typedef sz_ordering_t (*sz_order_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -typedef void (*sz_to_converter_t)(sz_cptr_t, sz_size_t, sz_ptr_t); - -/** - * @brief Computes the 64-bit check-sum of bytes in a string. - * Similar to `std::ranges::accumulate`. - * - * @param text String to aggregate. - * @param length Number of bytes in the text. - * @return 64-bit unsigned value. - */ -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length); - -/** @copydoc sz_checksum */ -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length); - -/** - * @brief Computes the 64-bit unsigned hash of a string. Fairly fast for short strings, - * simple implementation, and supports rolling computation, reused in other APIs. - * Similar to `std::hash` in C++. - * - * @param text String to hash. - * @param length Number of bytes in the text. - * @return 64-bit hash value. 
- * - * @see sz_hashes, sz_hashes_fingerprint, sz_hashes_intersection - */ -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length); - -/** @copydoc sz_hash */ -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length); - -/** - * @brief Checks if two string are equal. - * Similar to `memcmp(a, b, length) == 0` in LibC and `a == b` in STL. - * - * The implementation of this function is very similar to `sz_order`, but the usage patterns are different. - * This function is more often used in parsing, while `sz_order` is often used in sorting. - * It works best on platforms with cheap - * - * @param a First string to compare. - * @param b Second string to compare. - * @param length Number of bytes in both strings. - * @return 1 if strings match, 0 otherwise. - */ -SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length); - -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length); - -/** - * @brief Estimates the relative order of two strings. Equivalent to `memcmp(a, b, length)` in LibC. - * Can be used on different length strings. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * @return Negative if (a < b), positive if (a > b), zero if they are equal. - */ -SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); - -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); - -/** - * @brief Look Up Table @b (LUT) transformation of a string. Equivalent to `for (char & c : text) c = lut[c]`. - * - * Can be used to implement some form of string normalization, partially masking punctuation marks, - * or converting between different character sets, like uppercase or lowercase. Surprisingly, also has - * broad implications in image processing, where image channel transformations are often done using LUTs. - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param lut Look Up Table to apply. Must be exactly @b 256 bytes long. - * @param result Output string, can point to the same address as ::text. - */ -SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); - -typedef void (*sz_look_up_transform_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t); - -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); +#pragma endregion // Core Structure -/** - * @brief Equivalent to `for (char & c : text) c = tolower(c)`. - * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_tolower(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = toupper(c)`. 
- * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_toupper(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = toascii(c)`. - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Checks if all characters in the range are valid ASCII characters. - * - * @param text String to be analyzed. - * @param length Number of bytes in the string. - * @return Whether all characters are valid ASCII characters. - */ -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t text, sz_size_t length); - -/** - * @brief Generates a random string for a given alphabet, avoiding integer division and modulo operations. - * Similar to `text[i] = alphabet[rand() % cardinality]`. - * - * The modulo operation is expensive, and should be avoided in performance-critical code. - * We avoid it using small lookup tables and replacing it with a multiplication and shifts, similar to `libdivide`. - * Alternative algorithms would include: - * - Montgomery form: https://en.algorithmica.org/hpc/number-theory/montgomery/ - * - Barret reduction: https://www.nayuki.io/page/barrett-reduction-algorithm - * - Lemire's trick: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ - * - * @param alphabet Set of characters to sample from. - * @param cardinality Number of characters to sample from. - * @param text Output string, can point to the same address as ::text. - * @param generate Callback producing random numbers given the generator state. - * @param generator Generator state, can be a pointer to a seed, or a pointer to a random number generator. - */ -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); - -/** @copydoc sz_generate */ -SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); - -/** - * @brief Similar to `memcpy`, copies contents of one string into another. - * The behavior is undefined if the strings overlap. - * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. - */ -SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** - * @brief Similar to `memmove`, copies (moves) contents of one string into another. - * Unlike `sz_copy`, allows overlapping strings as arguments. - * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. 
- */ -SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -typedef void (*sz_move_t)(sz_ptr_t, sz_cptr_t, sz_size_t); - -/** - * @brief Similar to `memset`, fills a string with a given value. - * - * @param target String to fill. - * @param length Number of bytes to fill. - * @param value Value to fill with. - */ -SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value); - -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value); - -typedef void (*sz_fill_t)(sz_ptr_t, sz_size_t, sz_u8_t); +#pragma region Core API /** * @brief Initializes a string class instance to an empty value. @@ -634,8 +119,8 @@ SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string); * @param space Number of bytes allocated for the string (heap or stack), including the SZ_NULL character. * @param is_external Whether the string is allocated on the heap externally, or fits withing ::string instance. */ -SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, - sz_bool_t *is_external); +SZ_PUBLIC void sz_string_unpack( // + sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, sz_bool_t *is_external); /** * @brief Unpacks only the start and length of the string. @@ -681,8 +166,8 @@ SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity * @param allocator Memory allocator to use for the allocation. * @return SZ_NULL if the operation failed, pointer to the new start of the string otherwise. */ -SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length, - sz_memory_allocator_t *allocator); +SZ_PUBLIC sz_ptr_t sz_string_expand( // + sz_string_t *string, sz_size_t offset, sz_size_t added_length, sz_memory_allocator_t *allocator); /** * @brief Removes a range from a string. Changes the length, but not the capacity. @@ -714,6443 +199,242 @@ SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *alloca #pragma endregion -#pragma region Fast Substring Search API - -typedef sz_cptr_t (*sz_find_byte_t)(sz_cptr_t, sz_size_t, sz_cptr_t); -typedef sz_cptr_t (*sz_find_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_charset_t const *); - -/** - * @brief Locates first matching byte in a string. Equivalent to `memchr(haystack, *needle, h_length)` in LibC. - * - * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memchr.S - * Aarch64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/aarch64/memchr.S - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the first match. - */ -SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** - * @brief Locates last matching byte in a string. Equivalent to `memrchr(haystack, *needle, h_length)` in LibC. - * - * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memrchr.S - * Aarch64 implementation: missing - * - * @param haystack Haystack - the string to search in. 
- * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the last match. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +#pragma region Serial Implementation -/** - * @brief Locates first matching substring. - * Equivalent to `memmem(haystack, h_length, needle, n_length)` in LibC. - * Similar to `strstr(haystack, needle)` in LibC, but requires known length. - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the first match. - */ -SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); +SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string) { + // It doesn't matter if it's on stack or heap, the pointer location is the same. + return (sz_bool_t)((sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]); +} -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); +SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length) { + sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]; + sz_size_t is_big_mask = is_small - 1ull; + *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same. + // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. + *length = string->external.length & (0x00000000000000FFull | is_big_mask); +} -/** - * @brief Locates the last matching substring. - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the last match. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); +SZ_PUBLIC void sz_string_unpack( // + sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, sz_bool_t *is_external) { + sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]; + sz_size_t is_big_mask = is_small - 1ull; + *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same. + // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. + *length = string->external.length & (0x00000000000000FFull | is_big_mask); + // In case the string is small, the `is_small - 1ull` will become 0xFFFFFFFFFFFFFFFFull. + *space = sz_u64_blend(_SZ_STRING_INTERNAL_SPACE, string->external.space, is_big_mask); + *is_external = (sz_bool_t)!is_small; +} -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); +SZ_PUBLIC sz_bool_t sz_string_equal(sz_string_t const *a, sz_string_t const *b) { + // Tempting to say that the external.length is bitwise the same even if it includes + // some bytes of the on-stack payload, but we don't at this writing maintain that invariant. 
+ // (An on-stack string includes noise bytes in the high-order bits of external.length. So do this + // the hard/correct way. -/** - * @brief Finds the first character present from the ::set, present in ::text. - * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_rfind_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". - * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the first matching character from ::set. - */ -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); +#if SZ_USE_MISALIGNED_LOADS + // Dealing with StringZilla strings, we know that the `start` pointer always points + // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); +#endif + // Alternatively, fall back to byte-by-byte comparison. + sz_ptr_t a_start, b_start; + sz_size_t a_length, b_length; + sz_string_range(a, &a_start, &a_length); + sz_string_range(b, &b_start, &b_length); + return (sz_bool_t)(a_length == b_length && sz_equal(a_start, b_start, b_length)); +} -/** - * @brief Finds the last character present from the ::set, present in ::text. - * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_find_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". - * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the last matching character from ::set. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); +SZ_PUBLIC sz_ordering_t sz_string_order(sz_string_t const *a, sz_string_t const *b) { +#if SZ_USE_MISALIGNED_LOADS + // Dealing with StringZilla strings, we know that the `start` pointer always points + // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); +#endif + // Alternatively, fall back to byte-by-byte comparison. + sz_ptr_t a_start, b_start; + sz_size_t a_length, b_length; + sz_string_range(a, &a_start, &a_length); + sz_string_range(b, &b_start, &b_length); + return sz_order(a_start, a_length, b_start, b_length); +} -#pragma endregion +SZ_PUBLIC void sz_string_init(sz_string_t *string) { + sz_assert(string && "String can't be SZ_NULL."); -#pragma region String Similarity Measures API + // Only 8 + 1 + 1 need to be initialized. + string->internal.start = &string->internal.chars[0]; + // But for safety let's initialize the entire structure to zeros. 
+ // string->internal.chars[0] = 0; + // string->internal.length = 0; + string->words[1] = 0; + string->words[2] = 0; + string->words[3] = 0; +} -/** - * @brief Computes the Hamming distance between two strings - number of not matching characters. - * Difference in length is is counted as a mismatch. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for the distance, the `bound` if was exceeded. - * - * @see sz_hamming_distance_utf8 - * @see https://en.wikipedia.org/wiki/Hamming_distance - */ -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); - -/** @copydoc sz_hamming_distance */ -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); - -/** - * @brief Computes the Hamming distance between two @b UTF8 strings - number of not matching characters. - * Difference in length is is counted as a mismatch. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for the distance, the `bound` if was exceeded. - * - * @see sz_hamming_distance - * @see https://en.wikipedia.org/wiki/Hamming_distance - */ -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); - -/** @copydoc sz_hamming_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); - -typedef sz_size_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t); - -/** - * @brief Computes the Levenshtein edit-distance between two strings using the Wagner-Fisher algorithm. - * Similar to the Needleman-Wunsch alignment algorithm. Often used in fuzzy string matching. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Exclusive upper bound on the distance, that allows us to exit early. - * Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore. - * Pass zero to check if the strings are equal. - * @return Unsigned integer for the edit distance. Zero means the strings are equal. - * Returns the `bound` if it was exceeded or `SZ_SIZE_MAX` if the memory allocation failed. 
- * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default - * @see https://en.wikipedia.org/wiki/Levenshtein_distance - */ -SZ_DYNAMIC sz_size_t sz_edit_distance(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** @copydoc sz_edit_distance */ -SZ_PUBLIC sz_size_t sz_edit_distance_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** - * @brief Computes the Levenshtein edit-distance between two @b UTF8 strings. - * Unlike `sz_edit_distance`, reports the distance in Unicode codepoints, and not in bytes. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for edit distance, the `bound` if was exceeded or `SZ_SIZE_MAX` - * if the memory allocation failed. - * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default, sz_edit_distance - * @see https://en.wikipedia.org/wiki/Levenshtein_distance - */ -SZ_DYNAMIC sz_size_t sz_edit_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -typedef sz_size_t (*sz_edit_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_memory_allocator_t *); - -/** @copydoc sz_edit_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_edit_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** - * @brief Computes Needleman–Wunsch alignment score for two string. Often used in bioinformatics and cheminformatics. - * Similar to the Levenshtein edit-distance, parameterized for gap and substitution penalties. - * - * Not commutative in the general case, as the order of the strings matters, as `sz_alignment_score(a, b)` may - * not be equal to `sz_alignment_score(b, a)`. Becomes @b commutative, if the substitution costs are symmetric. - * Equivalent to the negative Levenshtein distance, if: `gap == -1` and `subs[i][j] == (i == j ? 0: -1)`. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * @param gap Penalty cost for gaps - insertions and removals. - * @param subs Substitution costs matrix with 256 x 256 values for all pairs of characters. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @return Signed similarity score. Can be negative, depending on the substitution costs. - * If the memory allocation fails, the function returns `SZ_SSIZE_MAX`. 
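The linear-memory claim above follows from the classic two-row formulation of Wagner-Fischer: only the previous and the current row of the DP matrix are kept. A standalone sketch of that recurrence, not sz_edit_distance itself:

    #include <stdio.h>
    #include <stdlib.h>

    static size_t levenshtein_two_rows(char const *a, size_t a_len, char const *b, size_t b_len) {
        size_t *previous = (size_t *)malloc((b_len + 1) * sizeof(size_t));
        size_t *current = (size_t *)malloc((b_len + 1) * sizeof(size_t));
        for (size_t j = 0; j <= b_len; ++j) previous[j] = j;
        for (size_t i = 1; i <= a_len; ++i) {
            current[0] = i;
            for (size_t j = 1; j <= b_len; ++j) {
                size_t substitution = previous[j - 1] + (a[i - 1] != b[j - 1]);
                size_t deletion = previous[j] + 1;
                size_t insertion = current[j - 1] + 1;
                size_t best = substitution < deletion ? substitution : deletion;
                current[j] = best < insertion ? best : insertion;
            }
            size_t *swap = previous; // the current row becomes the previous one
            previous = current, current = swap;
        }
        size_t result = previous[b_len];
        free(previous), free(current);
        return result;
    }

    int main(void) {
        printf("%zu\n", levenshtein_two_rows("kitten", 6, "sitting", 7)); // 3
        return 0;
    }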
- * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default - * @see https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm - */ -SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); - -/** @copydoc sz_alignment_score */ -SZ_PUBLIC sz_ssize_t sz_alignment_score_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); - -typedef sz_ssize_t (*sz_alignment_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *, - sz_error_cost_t, sz_memory_allocator_t *); - -typedef void (*sz_hash_callback_t)(sz_cptr_t, sz_size_t, sz_u64_t, void *user); - -/** - * @brief Computes the Karp-Rabin rolling hashes of a string supplying them to the provided `callback`. - * Can be used for similarity scores, search, ranking, etc. - * - * Rabin-Karp-like rolling hashes can have very high-level of collisions and depend - * on the choice of bases and the prime number. That's why, often two hashes from the same - * family are used with different bases. - * - * 1. Kernighan and Ritchie's function uses 31, a prime close to the size of English alphabet. - * 2. To be friendlier to byte-arrays and UTF8, we use 257 for the second function. - * - * Choosing the right ::window_length is task- and domain-dependant. For example, most English words are - * between 3 and 7 characters long, so a window of 4 bytes would be a good choice. For DNA sequences, - * the ::window_length might be a multiple of 3, as the codons are 3 (nucleotides) bytes long. - * With such minimalistic alphabets of just four characters (AGCT) longer windows might be needed. - * For protein sequences the alphabet is 20 characters long, so the window can be shorter, than for DNAs. - * - * @param text String to hash. - * @param length Number of bytes in the string. - * @param window_length Length of the rolling window in bytes. - * @param window_step Step of reported hashes. @b Must be power of two. Should be smaller than `window_length`. - * @param callback Function receiving the start & length of a substring, the hash, and the `callback_handle`. - * @param callback_handle Optional user-provided pointer to be passed to the `callback`. - * @see sz_hashes_fingerprint, sz_hashes_intersection - */ -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); - -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); - -typedef void (*sz_hashes_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_size_t, sz_hash_callback_t, void *); - -/** - * @brief Computes the Karp-Rabin rolling hashes of a string outputting a binary fingerprint. - * Such fingerprints can be compared with Hamming or Jaccard (Tanimoto) distance for similarity. - * - * The algorithm doesn't clear the fingerprint buffer on start, so it can be invoked multiple times - * to produce a fingerprint of a longer string, by passing the previous fingerprint as the ::fingerprint. - * It can also be reused to produce multi-resolution fingerprints by changing the ::window_length - * and calling the same function multiple times for the same input ::text. 
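To make the rolling-hash description above concrete, here is a standalone Karp-Rabin-style sketch over a fixed window, using the byte-friendly base 257 mentioned in the comment. The modulus and the printed values are illustrative choices, not the library's exact scheme.

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
        char const *text = "abracadabra";
        size_t length = strlen(text), window = 4;
        uint64_t const base = 257, modulus = 1000000007ull;

        // Precompute base^(window - 1) so the outgoing byte can be dropped in O(1).
        uint64_t leading_power = 1;
        for (size_t i = 1; i < window; ++i) leading_power = leading_power * base % modulus;

        uint64_t hash = 0;
        for (size_t i = 0; i < length; ++i) {
            if (i >= window) // remove the byte that slides out of the window
                hash = (hash + modulus - (uint64_t)(unsigned char)text[i - window] * leading_power % modulus) % modulus;
            hash = (hash * base + (unsigned char)text[i]) % modulus;
            if (i + 1 >= window)
                printf("hash of \"%.*s\" = %llu\n", (int)window, text + i + 1 - window, (unsigned long long)hash);
        }
        return 0;
    }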
- * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text String to hash. - * @param length Number of bytes in the string. - * @param fingerprint Output fingerprint buffer. - * @param fingerprint_bytes Number of bytes in the fingerprint buffer. - * @param window_length Length of the rolling window in bytes. - * @see sz_hashes, sz_hashes_intersection - */ -SZ_PUBLIC void sz_hashes_fingerprint( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_ptr_t fingerprint, sz_size_t fingerprint_bytes); - -typedef void (*sz_hashes_fingerprint_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_ptr_t, sz_size_t); - -/** - * @brief Given a hash-fingerprint of a textual document, computes the number of intersecting hashes - * of the incoming document. Can be used for document scoring and search. - * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text Input document. - * @param length Number of bytes in the input document. - * @param fingerprint Reference document fingerprint. - * @param fingerprint_bytes Number of bytes in the reference documents fingerprint. - * @param window_length Length of the rolling window in bytes. - * @see sz_hashes, sz_hashes_fingerprint - */ -SZ_PUBLIC sz_size_t sz_hashes_intersection( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_cptr_t fingerprint, sz_size_t fingerprint_bytes); - -typedef sz_size_t (*sz_hashes_intersection_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_cptr_t, sz_size_t); - -#pragma endregion - -#pragma region Convenience API - -/** - * @brief Finds the first character in the haystack, that is present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the first character in the haystack, that is @b not present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the last character in the haystack, that is present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the last character in the haystack, that is @b not present in the needle. - * Convenience function, reused across different language bindings. 
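As a companion to the fingerprint description above, a standalone sketch of comparing two binary fingerprints the way the comment suggests: the Hamming distance counts differing bits, and the Jaccard (Tanimoto) similarity is the ratio of shared set bits to the union of set bits. The fingerprint values here are made up for the demo.

    #include <stdio.h>
    #include <stdint.h>

    static int popcount64(uint64_t x) {
        int count = 0;
        for (; x; x &= x - 1) ++count; // clear the lowest set bit on every step
        return count;
    }

    int main(void) {
        uint64_t fingerprint_a[4] = {0x0F0F0F0F00000000ull, 0x1ull, 0x0ull, 0xFFull};
        uint64_t fingerprint_b[4] = {0x0F0F000000000000ull, 0x3ull, 0x0ull, 0xF0ull};
        int hamming = 0, shared_bits = 0, union_bits = 0;
        for (int i = 0; i < 4; ++i) {
            hamming += popcount64(fingerprint_a[i] ^ fingerprint_b[i]);
            shared_bits += popcount64(fingerprint_a[i] & fingerprint_b[i]);
            union_bits += popcount64(fingerprint_a[i] | fingerprint_b[i]);
        }
        printf("Hamming: %d, Tanimoto: %.3f\n", hamming, union_bits ? (double)shared_bits / union_bits : 1.0);
        return 0;
    }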
- * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -#pragma endregion - -#pragma region String Sequences API - -struct sz_sequence_t; - -typedef sz_cptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t); -typedef sz_bool_t (*sz_string_is_less_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); - -typedef struct sz_sequence_t { - sz_sorted_idx_t *order; - sz_size_t count; - sz_sequence_member_start_t get_start; - sz_sequence_member_length_t get_length; - void const *handle; -} sz_sequence_t; - -/** - * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. - * Expects ::offsets to contains `count + 1` entries, the last pointing at the end - * of the last string, indicating the total length of the ::tape. - */ -SZ_PUBLIC void sz_sequence_from_u32tape(sz_cptr_t *start, sz_u32_t const *offsets, sz_size_t count, - sz_sequence_t *sequence); - -/** - * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. - * Expects ::offsets to contains `count + 1` entries, the last pointing at the end - * of the last string, indicating the total length of the ::tape. - */ -SZ_PUBLIC void sz_sequence_from_u64tape(sz_cptr_t *start, sz_u64_t const *offsets, sz_size_t count, - sz_sequence_t *sequence); - -/** - * @brief Similar to `std::partition`, given a predicate splits the sequence into two parts. - * The algorithm is unstable, meaning that elements may change relative order, as long - * as they are in the right partition. This is the simpler algorithm for partitioning. - */ -SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate); - -/** - * @brief Inplace `std::set_union` for two consecutive chunks forming the same continuous `sequence`. - * - * @param partition The number of elements in the first sub-sequence in `sequence`. - * @param less Comparison function, to determine the lexicographic ordering. - */ -SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less); - -/** - * @brief Sorting algorithm, combining Radix Sort for the first 32 bits of every word - * and a follow-up by a more conventional sorting procedure on equally prefixed parts. - */ -SZ_PUBLIC void sz_sort(sz_sequence_t *sequence); - -/** - * @brief Partial sorting algorithm, combining Radix Sort for the first 32 bits of every word - * and a follow-up by a more conventional sorting procedure on equally prefixed parts. - */ -SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t n); - -/** - * @brief Intro-Sort algorithm that supports custom comparators. - */ -SZ_PUBLIC void sz_sort_intro(sz_sequence_t *sequence, sz_sequence_comparator_t less); - -#pragma endregion - -/* - * Hardware feature detection. - * All of those can be controlled by the user. 
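For clarity, the tape layout expected by sz_sequence_from_u32tape can be pictured as follows: all strings are concatenated into one buffer, and offsets holds count + 1 entries, so string i spans [offsets[i], offsets[i + 1]). A standalone sketch with made-up data:

    #include <stdio.h>
    #include <stdint.h>

    int main(void) {
        char const tape[] = "catdogmouse";               // three strings back to back
        uint32_t const offsets[] = {0, 3, 6, 11};        // count + 1 = 4 entries, last one marks the tape end
        size_t const count = 3;
        for (size_t i = 0; i < count; ++i)
            printf("string %zu: \"%.*s\"\n", i, (int)(offsets[i + 1] - offsets[i]), tape + offsets[i]);
        return 0;
    }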
- */ -#ifndef SZ_USE_X86_AVX512 -#ifdef __AVX512BW__ -#define SZ_USE_X86_AVX512 1 -#else -#define SZ_USE_X86_AVX512 0 -#endif -#endif - -#ifndef SZ_USE_X86_AVX2 -#ifdef __AVX2__ -#define SZ_USE_X86_AVX2 1 -#else -#define SZ_USE_X86_AVX2 0 -#endif -#endif - -#ifndef SZ_USE_ARM_NEON -#ifdef __ARM_NEON -#define SZ_USE_ARM_NEON 1 -#else -#define SZ_USE_ARM_NEON 0 -#endif -#endif - -#ifndef SZ_USE_ARM_SVE -#ifdef __ARM_FEATURE_SVE -#define SZ_USE_ARM_SVE 1 -#else -#define SZ_USE_ARM_SVE 0 -#endif -#endif - -/* - * Include hardware-specific headers. - */ -#if SZ_USE_X86_AVX512 || SZ_USE_X86_AVX2 -#include -#endif // SZ_USE_X86... -#if SZ_USE_ARM_NEON -#if !defined(_MSC_VER) -#include -#endif -#include -#endif // SZ_USE_ARM_NEON -#if SZ_USE_ARM_SVE -#if !defined(_MSC_VER) -#include -#endif -#endif // SZ_USE_ARM_SVE - -#pragma region Hardware Specific API - -#if SZ_USE_X86_AVX512 - -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_edit_distance */ -SZ_PUBLIC sz_size_t sz_edit_distance_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); -/** @copydoc sz_alignment_score */ -SZ_PUBLIC sz_ssize_t sz_alignment_score_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle); -#endif - -#if SZ_USE_X86_AVX2 -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** 
@copydoc sz_fill */ -SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle); -#endif - -#if SZ_USE_ARM_NEON -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -#endif - -#if SZ_USE_ARM_SVE -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_sve(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_sve(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_sve(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t 
sz_rfind_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_sve(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_sve(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -#endif - -#pragma endregion - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wconversion" - -/* - ********************************************************************************************************************** - ********************************************************************************************************************** - ********************************************************************************************************************** - * - * This is where we the actual implementation begins. - * The rest of the file is hidden from the public API. - * - ********************************************************************************************************************** - ********************************************************************************************************************** - ********************************************************************************************************************** - */ - -#pragma region Compiler Extensions and Helper Functions - -#pragma GCC visibility push(hidden) - -/** - * @brief Helper-macro to mark potentially unused variables. - */ -#define sz_unused(x) ((void)(x)) - -/** - * @brief Helper-macro casting a variable to another type of the same size. - */ -#define sz_bitcast(type, value) (*((type *)&(value))) - -/** - * @brief Defines `SZ_NULL`, analogous to `NULL`. - * The default often comes from locale.h, stddef.h, - * stdio.h, stdlib.h, string.h, time.h, or wchar.h. - */ -#ifdef __GNUG__ -#define SZ_NULL __null -#define SZ_NULL_CHAR __null -#else -#define SZ_NULL ((void *)0) -#define SZ_NULL_CHAR ((char *)0) -#endif - -/** - * @brief Cache-line width, that will affect the execution of some algorithms, - * like equality checks and relative order computing. - */ -#define SZ_CACHE_LINE_WIDTH (64) // bytes - -/** - * @brief Similar to `assert`, the `sz_assert` is used in the SZ_DEBUG mode - * to check the invariants of the library. It's a no-op in the SZ_RELEASE mode. - * @note If you want to catch it, put a breakpoint at @b `__GI_exit` - */ -#if SZ_DEBUG && defined(SZ_AVOID_LIBC) && !SZ_AVOID_LIBC && !defined(SZ_PIC) -#include // `fprintf` -#include // `EXIT_FAILURE` -SZ_PUBLIC void _sz_assert_failure(char const *condition, char const *file, int line) { - fprintf(stderr, "Assertion failed: %s, in file %s, line %d\n", condition, file, line); - exit(EXIT_FAILURE); -} -#define sz_assert(condition) \ - do { \ - if (!(condition)) { _sz_assert_failure(#condition, __FILE__, __LINE__); } \ - } while (0) -#else -#define sz_assert(condition) ((void)(condition)) -#endif - -/* Intrinsics aliases for MSVC, GCC, Clang, and Clang-Cl. - * The following section of compiler intrinsics comes in 2 flavors. - */ -#if defined(_MSC_VER) && !defined(__clang__) // On Clang-CL -#include - -// Sadly, when building Win32 images, we can't use the `_tzcnt_u64`, `_lzcnt_u64`, -// `_BitScanForward64`, or `_BitScanReverse64` intrinsics. For now it's a simple `for`-loop. -// TODO: In the future we can switch to a more efficient De Bruijn's algorithm. 
-// https://www.chessprogramming.org/BitScan -// https://www.chessprogramming.org/De_Bruijn_Sequence -// https://gist.github.com/resilar/e722d4600dbec9752771ab4c9d47044f -// -// Use the serial version on 32-bit x86 and on Arm. -#if (defined(_WIN32) && !defined(_WIN64)) || defined(_M_ARM) || defined(_M_ARM64) -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 1) == 0) { n++, x >>= 1; } - return n; -} -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 0x8000000000000000ull) == 0) { n++, x <<= 1; } - return n; -} -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { - x = x - ((x >> 1) & 0x5555555555555555ull); - x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull); - return (((x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Full) * 0x0101010101010101ull) >> 56; -} -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 1) == 0) { n++, x >>= 1; } - return n; -} -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 0x80000000u) == 0) { n++, x <<= 1; } - return n; -} -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { - x = x - ((x >> 1) & 0x55555555); - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - return (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; -} -#else -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return (int)_tzcnt_u64(x); } -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return (int)_lzcnt_u64(x); } -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return (int)__popcnt64(x); } -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return (int)_tzcnt_u32(x); } -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return (int)_lzcnt_u32(x); } -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return (int)__popcnt(x); } -#endif -// Force the byteswap functions to be intrinsics, because when /Oi- is given, these will turn into CRT function calls, -// which breaks when `SZ_AVOID_LIBC` is given -#pragma intrinsic(_byteswap_uint64) -SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return _byteswap_uint64(val); } -#pragma intrinsic(_byteswap_ulong) -SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return _byteswap_ulong(val); } -#else -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return __builtin_popcountll(x); } -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return __builtin_popcount(x); } -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return __builtin_ctzll(x); } -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return __builtin_clzll(x); } -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return __builtin_ctz(x); } // ! Undefined if `x == 0` -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return __builtin_clz(x); } // ! Undefined if `x == 0` -SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return __builtin_bswap64(val); } -SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return __builtin_bswap32(val); } -#endif - -SZ_INTERNAL sz_u64_t sz_u64_rotl(sz_u64_t x, sz_u64_t r) { return (x << r) | (x >> (64 - r)); } - -/** - * @brief Select bits from either ::a or ::b depending on the value of ::mask bits. - * - * Similar to `_mm_blend_epi16` intrinsic on x86. - * Described in the "Bit Twiddling Hacks" by Sean Eron Anderson. - * https://graphics.stanford.edu/~seander/bithacks.html#ConditionalSetOrClearBitsWithoutBranching - */ -SZ_INTERNAL sz_u64_t sz_u64_blend(sz_u64_t a, sz_u64_t b, sz_u64_t mask) { return a ^ ((a ^ b) & mask); } - -/* - * Efficiently computing the minimum and maximum of two or three values can be tricky. - * The simple branching baseline would be: - * - * x < y ? 
x : y // can replace with 1 conditional move - * - * Branchless approach is well known for signed integers, but it doesn't apply to unsigned ones. - * https://stackoverflow.com/questions/514435/templatized-branchless-int-max-min-function - * https://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax - * Using only bit-shifts for singed integers it would be: - * - * y + ((x - y) & (x - y) >> 31) // 4 unique operations - * - * Alternatively, for any integers using multiplication: - * - * (x > y) * y + (x <= y) * x // 5 operations - * - * Alternatively, to avoid multiplication: - * - * x & ~((x < y) - 1) + y & ((x < y) - 1) // 6 unique operations - */ -#define sz_min_of_two(x, y) (x < y ? x : y) -#define sz_max_of_two(x, y) (x < y ? y : x) -#define sz_min_of_three(x, y, z) sz_min_of_two(x, sz_min_of_two(y, z)) -#define sz_max_of_three(x, y, z) sz_max_of_two(x, sz_max_of_two(y, z)) - -/** @brief Branchless minimum function for two signed 32-bit integers. */ -SZ_INTERNAL sz_i32_t sz_i32_min_of_two(sz_i32_t x, sz_i32_t y) { return y + ((x - y) & (x - y) >> 31); } - -/** @brief Branchless minimum function for two signed 32-bit integers. */ -SZ_INTERNAL sz_i32_t sz_i32_max_of_two(sz_i32_t x, sz_i32_t y) { return x - ((x - y) & (x - y) >> 31); } - -/** - * @brief Clamps signed offsets in a string to a valid range. Used for Pythonic-style slicing. - */ -SZ_INTERNAL void sz_ssize_clamp_interval(sz_size_t length, sz_ssize_t start, sz_ssize_t end, - sz_size_t *normalized_offset, sz_size_t *normalized_length) { - // TODO: Remove branches. - // Normalize negative indices - if (start < 0) start += length; - if (end < 0) end += length; - - // Clamp indices to a valid range - if (start < 0) start = 0; - if (end < 0) end = 0; - if (start > (sz_ssize_t)length) start = length; - if (end > (sz_ssize_t)length) end = length; - - // Ensure start <= end - if (start > end) start = end; - - *normalized_offset = start; - *normalized_length = end - start; -} - -/** - * @brief Compute the logarithm base 2 of a positive integer, rounding down. - */ -SZ_INTERNAL sz_size_t sz_size_log2i_nonzero(sz_size_t x) { - sz_assert(x > 0 && "Non-positive numbers have no defined logarithm"); - sz_size_t leading_zeros = sz_u64_clz(x); - return 63 - leading_zeros; -} - -/** - * @brief Compute the smallest power of two greater than or equal to ::x. - */ -SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) { - // Unlike the commonly used trick with `clz` intrinsics, is valid across the whole range of `x`. - // https://stackoverflow.com/a/10143264 - x--; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; -#if SZ_DETECT_64_BIT - x |= x >> 32; -#endif - x++; - return x; -} - -/** - * @brief Transposes an 8x8 bit matrix packed in a `sz_u64_t`. - * - * There is a well known SWAR sequence for that known to chess programmers, - * willing to flip a bit-matrix of pieces along the main A1-H8 diagonal. - * https://www.chessprogramming.org/Flipping_Mirroring_and_Rotating - * https://lukas-prokop.at/articles/2021-07-23-transpose - */ -SZ_INTERNAL sz_u64_t sz_u64_transpose(sz_u64_t x) { - sz_u64_t t; - t = x ^ (x << 36); - x ^= 0xf0f0f0f00f0f0f0full & (t ^ (x >> 36)); - t = 0xcccc0000cccc0000ull & (x ^ (x << 18)); - x ^= t ^ (t >> 18); - t = 0xaa00aa00aa00aa00ull & (x ^ (x << 9)); - x ^= t ^ (t >> 9); - return x; -} - -/** - * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. 
- */ -SZ_INTERNAL void sz_u64_swap(sz_u64_t *a, sz_u64_t *b) { - sz_u64_t t = *a; - *a = *b; - *b = t; -} - -/** - * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. - */ -SZ_INTERNAL void sz_pointer_swap(void **a, void **b) { - void *t = *a; - *a = *b; - *b = t; -} - -/** - * @brief Helper structure to simplify work with 16-bit words. - * @see sz_u16_load - */ -typedef union sz_u16_vec_t { - sz_u16_t u16; - sz_u8_t u8s[2]; -} sz_u16_vec_t; - -/** - * @brief Load a 16-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u16_vec_t sz_u16_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u16_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u16_vec_t *)ptr); -#else - return *((__unaligned sz_u16_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u16_vec_t const *result = (sz_u16_vec_t const *)ptr; - return *result; -#endif -} - -/** - * @brief Helper structure to simplify work with 32-bit words. - * @see sz_u32_load - */ -typedef union sz_u32_vec_t { - sz_u32_t u32; - sz_u16_t u16s[2]; - sz_u8_t u8s[4]; -} sz_u32_vec_t; - -/** - * @brief Load a 32-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u32_vec_t sz_u32_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u32_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - result.u8s[2] = ptr[2]; - result.u8s[3] = ptr[3]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u32_vec_t *)ptr); -#else - return *((__unaligned sz_u32_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u32_vec_t const *result = (sz_u32_vec_t const *)ptr; - return *result; -#endif -} - -/** - * @brief Helper structure to simplify work with 64-bit words. - * @see sz_u64_load - */ -typedef union sz_u64_vec_t { - sz_u64_t u64; - sz_u32_t u32s[2]; - sz_u16_t u16s[4]; - sz_u8_t u8s[8]; -} sz_u64_vec_t; - -/** - * @brief Load a 64-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u64_vec_t sz_u64_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u64_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - result.u8s[2] = ptr[2]; - result.u8s[3] = ptr[3]; - result.u8s[4] = ptr[4]; - result.u8s[5] = ptr[5]; - result.u8s[6] = ptr[6]; - result.u8s[7] = ptr[7]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u64_vec_t *)ptr); -#else - return *((__unaligned sz_u64_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u64_vec_t const *result = (sz_u64_vec_t const *)ptr; - return *result; -#endif -} - -/** @brief Helper function, using the supplied fixed-capacity buffer to allocate memory. 
*/ -SZ_INTERNAL sz_ptr_t _sz_memory_allocate_fixed(sz_size_t length, void *handle) { - sz_size_t capacity; - sz_copy((sz_ptr_t)&capacity, (sz_cptr_t)handle, sizeof(sz_size_t)); - sz_size_t consumed_capacity = sizeof(sz_size_t); - if (consumed_capacity + length > capacity) return SZ_NULL_CHAR; - return (sz_ptr_t)handle + consumed_capacity; -} - -/** @brief Helper "no-op" function, simulating memory deallocation when we use a "static" memory buffer. */ -SZ_INTERNAL void _sz_memory_free_fixed(sz_ptr_t start, sz_size_t length, void *handle) { - sz_unused(start && length && handle); -} - -/** @brief An internal callback used to set a bit in a power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) & (fingerprint_bytes - 1)] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback used to set a bit in a @b non power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_non_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) % fingerprint_bytes] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback, used to mix all the running hashes into one pointer-size value. */ -SZ_INTERNAL void _sz_hashes_fingerprint_scalar_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *scalar_handle) { - sz_unused(start && length && hash && scalar_handle); - sz_size_t *scalar_ptr = (sz_size_t *)scalar_handle; - *scalar_ptr ^= hash; -} - -/** - * @brief Chooses the offsets of the most interesting characters in a search needle. - * - * Search throughput can significantly deteriorate if we are matching the wrong characters. - * Say the needle is "aXaYa", and we are comparing the first, second, and last character. - * If we use SIMD and compare many offsets at a time, comparing against "a" in every register is a waste. - * - * Similarly, dealing with UTF8 inputs, we know that the lower bits of each character code carry more information. - * Cyrillic alphabet, for example, falls into [0x0410, 0x042F] code range for uppercase [А, Я], and - * into [0x0430, 0x044F] for lowercase [а, я]. Scanning through a text written in Russian, half of the - * bytes will carry absolutely no value and will be equal to 0x04. - */ -SZ_INTERNAL void _sz_locate_needle_anomalies(sz_cptr_t start, sz_size_t length, // - sz_size_t *first, sz_size_t *second, sz_size_t *third) { - *first = 0; - *second = length / 2; - *third = length - 1; - - // - int has_duplicates = // - start[*first] == start[*second] || // - start[*first] == start[*third] || // - start[*second] == start[*third]; - - // Loop through letters to find non-colliding variants. - if (length > 3 && has_duplicates) { - // Pivot the middle point right, until we find a character different from the first one. - for (; start[*second] == start[*first] && *second + 1 < *third; ++(*second)) {} - // Pivot the third (last) point left, until we find a different character. 
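As a worked example of the anomaly-picking loops in this function, the standalone sketch below repeats the same nudging rules on the "aXaYa" needle mentioned in the comment; it illustrates the heuristic of reducing duplicate probe bytes where possible, and is not library code.

    #include <stdio.h>
    #include <string.h>

    int main(void) {
        char const *needle = "aXaYa";
        size_t length = strlen(needle);
        size_t first = 0, second = length / 2, third = length - 1; // initial picks: 'a', 'a', 'a'
        // Nudge the middle pick right until it differs from the first one.
        for (; needle[second] == needle[first] && second + 1 < third; ++second) {}
        // Nudge the last pick left until it differs from the other two, if it can.
        for (; (needle[third] == needle[second] || needle[third] == needle[first]) && third > second + 1; --third) {}
        printf("offsets: %zu %zu %zu -> '%c' '%c' '%c'\n", first, second, third,
               needle[first], needle[second], needle[third]); // 0 3 4 -> 'a' 'Y' 'a'
        return 0;
    }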
- for (; (start[*third] == start[*second] || start[*third] == start[*first]) && *third > (*second + 1); - --(*third)) {} - } - - // TODO: Investigate alternative strategies for long needles. - // On very long needles we have the luxury to choose! - // Often dealing with UTF8, we will likely benefit from shifting the first and second characters - // further to the right, to achieve not only uniqueness within the needle, but also avoid common - // rune prefixes of 2-, 3-, and 4-byte codes. - if (length > 8) { - // Pivot the first and second points right, until we find a character, that: - // > is different from others. - // > doesn't start with 0b'110x'xxxx - only 5 bits of relevant info. - // > doesn't start with 0b'1110'xxxx - only 4 bits of relevant info. - // > doesn't start with 0b'1111'0xxx - only 3 bits of relevant info. - // - // So we are practically searching for byte values that start with 0b0xxx'xxxx or 0b'10xx'xxxx. - // Meaning they fall in the range [0, 127] and [128, 191], in other words any unsigned int up to 191. - sz_u8_t const *start_u8 = (sz_u8_t const *)start; - sz_size_t vibrant_first = *first, vibrant_second = *second, vibrant_third = *third; - - // Let's begin with the seccond character, as the termination criteria there is more obvious - // and we may end up with more variants to check for the first candidate. - for (; (start_u8[vibrant_second] > 191 || start_u8[vibrant_second] == start_u8[vibrant_third]) && - (vibrant_second + 1 < vibrant_third); - ++vibrant_second) {} - - // Now check if we've indeed found a good candidate or should revert the `vibrant_second` to `second`. - if (start_u8[vibrant_second] < 191) { *second = vibrant_second; } - else { vibrant_second = *second; } - - // Now check the first character. - for (; (start_u8[vibrant_first] > 191 || start_u8[vibrant_first] == start_u8[vibrant_second] || - start_u8[vibrant_first] == start_u8[vibrant_third]) && - (vibrant_first + 1 < vibrant_second); - ++vibrant_first) {} - - // Now check if we've indeed found a good candidate or should revert the `vibrant_first` to `first`. - // We don't need to shift the third one when dealing with texts as the last byte of the text is - // also the last byte of a rune and contains the most information. - if (start_u8[vibrant_first] < 191) { *first = vibrant_first; } - } -} - -#pragma GCC visibility pop -#pragma endregion - -#pragma region Serial Implementation - -#if !SZ_AVOID_LIBC -#include // `fprintf` -#include // `malloc`, `EXIT_FAILURE` - -SZ_PUBLIC void *_sz_memory_allocate_default(sz_size_t length, void *handle) { - sz_unused(handle); - return malloc(length); -} -SZ_PUBLIC void _sz_memory_free_default(sz_ptr_t start, sz_size_t length, void *handle) { - sz_unused(handle && length); - free(start); -} - -#endif - -SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc) { -#if !SZ_AVOID_LIBC - alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_default; - alloc->free = (sz_memory_free_t)_sz_memory_free_default; -#else - alloc->allocate = (sz_memory_allocate_t)SZ_NULL; - alloc->free = (sz_memory_free_t)SZ_NULL; -#endif - alloc->handle = SZ_NULL; -} - -SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length) { - // The logic here is simple - put the buffer length in the first slots of the buffer. - // Later use it for bounds checking. 
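A hedged usage sketch of this fixed-capacity allocator paired with sz_edit_distance, assuming the usual <stringzilla/stringzilla.h> include path; the 4 KB arena size is an arbitrary choice for the demo.

    #include <stdio.h>
    #include <stringzilla/stringzilla.h> // assumed include path

    int main(void) {
        char arena[4096]; // on-stack arena; size chosen only for illustration
        sz_memory_allocator_t alloc;
        sz_memory_allocator_init_fixed(&alloc, arena, sizeof(arena));
        // Temporary DP rows for the distance computation are carved out of `arena` instead of malloc.
        sz_size_t distance = sz_edit_distance("kitten", 6, "sitting", 7, SZ_SIZE_MAX, &alloc);
        printf("edit distance: %llu\n", (unsigned long long)distance); // 3
        return 0;
    }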
- alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_fixed; - alloc->free = (sz_memory_free_t)_sz_memory_free_fixed; - alloc->handle = &buffer; - sz_copy((sz_ptr_t)buffer, (sz_cptr_t)&length, sizeof(sz_size_t)); -} - -/** - * @brief Byte-level equality comparison between two strings. - * If unaligned loads are allowed, uses a switch-table to avoid loops on short strings. - */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_cptr_t const a_end = a + length; -#if SZ_USE_MISALIGNED_LOADS - if (length >= SZ_SWAR_THRESHOLD) { - sz_u64_vec_t a_vec, b_vec; - for (; a + 8 <= a_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) return sz_false_k; - } - } -#endif - while (a != a_end && *a == *b) a++, b++; - return (sz_bool_t)(a_end == a); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { - for (sz_cptr_t const end = text + length; text != end; ++text) - if (sz_charset_contains(set, *text)) return text; - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Warray-bounds" - sz_cptr_t const end = text; - for (text += length; text != end;) - if (sz_charset_contains(set, *(text -= 1))) return text; - return SZ_NULL_CHAR; -#pragma GCC diagnostic pop -} - -/** - * One option to avoid branching is to use conditional moves and lookup the comparison result in a table: - * sz_ordering_t ordering_lookup[2] = {sz_greater_k, sz_less_k}; - * for (; a != min_end; ++a, ++b) - * if (*a != *b) return ordering_lookup[*a < *b]; - * That, however, introduces a data-dependency. - * A cleaner option is to perform two comparisons and a subtraction. - * One instruction more, but no data-dependency. - */ -#define _sz_order_scalars(a, b) ((sz_ordering_t)((a > b) - (a < b))) - -SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - sz_bool_t a_shorter = (sz_bool_t)(a_length < b_length); - sz_size_t min_length = a_shorter ? a_length : b_length; - sz_cptr_t min_end = a + min_length; -#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN - for (sz_u64_vec_t a_vec, b_vec; a + 8 <= min_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) - return _sz_order_scalars(sz_u64_bytes_reverse(a_vec.u64), sz_u64_bytes_reverse(b_vec.u64)); - } -#endif - for (; a != min_end; ++a, ++b) - if (*a != *b) return _sz_order_scalars(*a, *b); - - // If the strings are equal up to `min_end`, then the shorter string is smaller - return _sz_order_scalars(a_length, b_length); -} - -/** - * @brief Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each byte is set. - // For that take the bottom 7 bits of each byte, add one to them, - // and if this sets the top bit to one, then all the 7 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) & ((vec.u64 & 0x8080808080808080ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b single-character needle in an arbitrary length haystack. 
- * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memchr(haystack, needle[0], haystack_length)`. - */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_end = h + h_length; - -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. -#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. - sz_u64_vec_t h_vec, n_vec, match_vec; - match_vec.u64 = 0; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h + 8 <= h_end; h += 8) { - h_vec.u64 = *(sz_u64_t const *)h; - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h + sz_u64_ctz(match_vec.u64) / 8; - } -#endif - - // Handle the misaligned tail. - for (; h < h_end; ++h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief Find the last occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memrchr(haystack, needle[0], haystack_length)`. - */ -sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_start = h; - - // Reposition the `h` pointer to the end, as we will be walking backwards. - h = h + h_length - 1; - -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. -#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)(h + 1) & 7ull) && h >= h_start; --h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. - sz_u64_vec_t h_vec, n_vec, match_vec; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h >= h_start + 7; h -= 8) { - h_vec.u64 = *(sz_u64_t const *)(h - 7); - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h - sz_u64_clz(match_vec.u64) / 8; - } -#endif - - for (; h >= h_start; --h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 2Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 2byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_2byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 2byte is set. - // For that take the bottom 15 bits of each 2byte, add one to them, - // and if this sets the top bit to one, then all the 15 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFF7FFF7FFF7FFFull) + 0x0001000100010001ull) & ((vec.u64 & 0x8000800080008000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b two-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_2byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 2 bytes long. 
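A standalone sketch of the SWAR match extraction used by sz_find_byte_serial above: broadcast the needle byte into all eight lanes, flag equal bytes by setting their top bits, and turn the lowest flagged bit into a byte offset. It assumes a little-endian load and the GCC/Clang __builtin_ctzll builtin, in line with the serial code above.

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
        char const haystack[8] = {'s', 't', 'r', 'i', 'n', 'g', 'z', 'a'};
        uint64_t word;
        memcpy(&word, haystack, 8); // on little-endian machines byte 0 lands in the lowest lane

        uint64_t needle_broadcast = (uint64_t)'n' * 0x0101010101010101ull; // same byte in every lane
        uint64_t equality = ~(word ^ needle_broadcast);                    // 0xFF in lanes that match
        uint64_t match = ((equality & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) &
                         (equality & 0x8080808080808080ull);               // top bit set only in matching lanes
        if (match) printf("first 'n' at offset %d\n", __builtin_ctzll(match) / 8); // prints 4
        return 0;
    }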
- sz_assert(h_length >= 2 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; -#endif - - sz_u64_vec_t h_even_vec, h_odd_vec, n_vec, matches_even_vec, matches_odd_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1]; - n_vec.u64 *= 0x0001000100010001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. - for (; h + 9 <= h_end; h += 8) { - h_even_vec.u64 = *(sz_u64_t *)h; - h_odd_vec.u64 = (h_even_vec.u64 >> 8) | ((sz_u64_t)h[8] << 56); - matches_even_vec = _sz_u64_each_2byte_equal(h_even_vec, n_vec); - matches_odd_vec = _sz_u64_each_2byte_equal(h_odd_vec, n_vec); - - matches_even_vec.u64 >>= 8; - if (matches_even_vec.u64 + matches_odd_vec.u64) { - sz_u64_t match_indicators = matches_even_vec.u64 | matches_odd_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 4Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 4byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_4byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 4byte is set. - // For that take the bottom 31 bits of each 4byte, add one to them, - // and if this sets the top bit to one, then all the 31 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFFFFFF7FFFFFFFull) + 0x0000000100000001ull) & ((vec.u64 & 0x8000000080000000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b four-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_4byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 4 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 4 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; -#endif - - sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, n_vec, matches0_vec, matches1_vec, matches2_vec, matches3_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2], n_vec.u8s[3] = n[3]; - n_vec.u64 *= 0x0000000100000001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using four 64-bit words. - // We load the subsequent four-byte word as well, taking its first bytes. 
Think of it as a glorified prefetch :) - sz_u64_t h_page_current, h_page_next; - for (; h + sizeof(sz_u64_t) + sizeof(sz_u32_t) <= h_end; h += sizeof(sz_u64_t)) { - h_page_current = *(sz_u64_t *)h; - h_page_next = *(sz_u32_t *)(h + 8); - h0_vec.u64 = (h_page_current); - h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); - h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); - h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); - matches0_vec = _sz_u64_each_4byte_equal(h0_vec, n_vec); - matches1_vec = _sz_u64_each_4byte_equal(h1_vec, n_vec); - matches2_vec = _sz_u64_each_4byte_equal(h2_vec, n_vec); - matches3_vec = _sz_u64_each_4byte_equal(h3_vec, n_vec); - - if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64) { - matches0_vec.u64 >>= 24; - matches1_vec.u64 >>= 16; - matches2_vec.u64 >>= 8; - sz_u64_t match_indicators = matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 4 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 3Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 3byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_3byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 4byte is set. - // For that take the bottom 31 bits of each 4byte, add one to them, - // and if this sets the top bit to one, then all the 31 bits are ones as well. - vec.u64 = ((vec.u64 & 0xFFFF7FFFFF7FFFFFull) + 0x0000000001000001ull) & ((vec.u64 & 0x0000800000800000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b three-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_3byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 3 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 3 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h; -#endif - - // We fetch 12 - sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, h4_vec; - sz_u64_vec_t matches0_vec, matches1_vec, matches2_vec, matches3_vec, matches4_vec; - sz_u64_vec_t n_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2]; - n_vec.u64 *= 0x0000000001000001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using three 64-bit words. - // We load the subsequent two-byte word as well. 
- sz_u64_t h_page_current, h_page_next; - for (; h + sizeof(sz_u64_t) + sizeof(sz_u16_t) <= h_end; h += sizeof(sz_u64_t)) { - h_page_current = *(sz_u64_t *)h; - h_page_next = *(sz_u16_t *)(h + 8); - h0_vec.u64 = (h_page_current); - h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); - h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); - h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); - h4_vec.u64 = (h_page_current >> 32) | (h_page_next << 32); - matches0_vec = _sz_u64_each_3byte_equal(h0_vec, n_vec); - matches1_vec = _sz_u64_each_3byte_equal(h1_vec, n_vec); - matches2_vec = _sz_u64_each_3byte_equal(h2_vec, n_vec); - matches3_vec = _sz_u64_each_3byte_equal(h3_vec, n_vec); - matches4_vec = _sz_u64_each_3byte_equal(h4_vec, n_vec); - - if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64) { - matches0_vec.u64 >>= 16; - matches1_vec.u64 >>= 8; - matches3_vec.u64 <<= 8; - matches4_vec.u64 <<= 16; - sz_u64_t match_indicators = - matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 3 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_find_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - // Several popular string matching algorithms are using a bad-character shift table. - // Boyer Moore: https://www-igm.univ-mlv.fr/~lecroq/string/node14.html - // Quick Search: https://www-igm.univ-mlv.fr/~lecroq/string/node19.html - // Smith: https://www-igm.univ-mlv.fr/~lecroq/string/node21.html - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) bad_shift_table.jumps[n[i]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the last `n_length - 1` bytes. 
- for (sz_size_t i = 0; i <= h_length - n_length;) { - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - i += bad_shift_table.jumps[h[i + n_length - 1]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for @b reverse-order exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) - bad_shift_table.jumps[n[n_length - i - 1]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the first `n_length - 1` bytes. - for (sz_size_t j = 0; j <= h_length - n_length;) { - sz_size_t i = h_length - n_length - j; - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - j += bad_shift_table.jumps[h[i]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Exact substring search helper function, that finds the first occurrence of a prefix of the needle - * using a given search function, and then verifies the remaining part of the needle. - */ -SZ_INTERNAL sz_cptr_t _sz_find_with_prefix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_prefix, sz_size_t prefix_length) { - - sz_size_t suffix_length = n_length - prefix_length; - while (1) { - sz_cptr_t found = find_prefix(h, h_length, n, prefix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = h_length - (found - h); - if (remaining < n_length) return SZ_NULL_CHAR; - if (sz_equal(found + prefix_length, n + prefix_length, suffix_length)) return found; - - // Adjust the position. 
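The `_sz_find_with_prefix` helper being defined here generalizes a simple pattern: find a short, cheap prefix with a fast primitive, then verify the tail. A reduced sketch of that pattern, with a hypothetical name and `memchr` standing in as the prefix finder:

#include <stddef.h>
#include <string.h>

/* Hypothetical reduction of "find a prefix, then verify the tail": locate the first
   byte with memchr, then confirm the remaining bytes of the needle with memcmp. */
static char const *find_with_first_byte(char const *h, size_t h_len, char const *n, size_t n_len) {
    if (!n_len || h_len < n_len) return NULL;
    char const *end = h + h_len;
    while ((h = memchr(h, n[0], (size_t)(end - h)))) {
        if ((size_t)(end - h) < n_len) return NULL;         /* not enough room left for the whole needle */
        if (memcmp(h + 1, n + 1, n_len - 1) == 0) return h; /* verify the remaining part */
        ++h;                                                /* step past the false positive */
    }
    return NULL;
}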
- h = found + 1; - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -/** - * @brief Exact reverse-order substring search helper function, that finds the last occurrence of a suffix of the - * needle using a given search function, and then verifies the remaining part of the needle. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_with_suffix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_suffix, sz_size_t suffix_length) { - - sz_size_t prefix_length = n_length - suffix_length; - while (1) { - sz_cptr_t found = find_suffix(h, h_length, n + prefix_length, suffix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = found - h; - if (remaining < prefix_length) return SZ_NULL_CHAR; - if (sz_equal(found - prefix_length, n, prefix_length)) return found - prefix_length; - - // Adjust the position. - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -SZ_INTERNAL sz_cptr_t _sz_find_over_4bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, (sz_find_t)_sz_find_4byte_serial, 4); -} - -SZ_INTERNAL sz_cptr_t _sz_find_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, - sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, _sz_find_horspool_upto_256bytes_serial, 256); -} - -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, - sz_size_t n_length) { - return _sz_rfind_with_suffix(h, h_length, n, n_length, _sz_rfind_horspool_upto_256bytes_serial, 256); -} - -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - -#if SZ_DETECT_BIG_ENDIAN - sz_find_t backends[] = { - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[(n_length > 1) + (n_length > 256)](h, h_length, n, n_length); -#else - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_2byte_serial, - (sz_find_t)_sz_find_3byte_serial, - (sz_find_t)_sz_find_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - (sz_find_t)_sz_find_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. - (n_length > 1) + (n_length > 2) + (n_length > 3) + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 4) + - // For longer needles - use skip tables. - (n_length > 8) + (n_length > 256)](h, h_length, n, n_length); -#endif -} - -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. 
- (sz_find_t)sz_rfind_byte_serial, - // TODO: implement reverse-order SWAR for 2/3/4 byte variants. - // TODO: (sz_find_t)_sz_rfind_2byte_serial, - // TODO: (sz_find_t)_sz_rfind_3byte_serial, - // TODO: (sz_find_t)_sz_rfind_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - // (sz_find_t)_sz_rfind_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_rfind_horspool_upto_256bytes_serial, - (sz_find_t)_sz_rfind_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. - 0 + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 1) + - // For longer needles - use skip tables. - (n_length > 256)](h, h_length, n, n_length); -} - -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // TODO: Generalize to remove the following asserts! - sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix."); - sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet."); - sz_unused(longer_length && bound); - - // We are going to store 3 diagonals of the matrix. - // The length of the longest (main) diagonal would be `n = (shorter_length + 1)`. - sz_size_t n = shorter_length + 1; - sz_size_t buffer_length = sizeof(sz_size_t) * n * 3; - sz_size_t *distances = (sz_size_t *)alloc->allocate(buffer_length, alloc->handle); - if (!distances) return SZ_SIZE_MAX; - - sz_size_t *previous_distances = distances; - sz_size_t *current_distances = previous_distances + n; - sz_size_t *next_distances = previous_distances + n * 2; - - // Initialize the first two diagonals: - previous_distances[0] = 0; - current_distances[0] = current_distances[1] = 1; - - // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_diagonal_index = 2; - for (; next_diagonal_index != n; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = next_diagonal_index + 1; - for (sz_size_t i = 0; i + 2 < next_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[next_diagonal_index - i - 2] != longer[i]; - sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; - sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; - next_distances[i + 1] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); - } - // Don't forget to populate the first row and the first column of the Levenshtein matrix. - next_distances[0] = next_distances[next_diagonal_length - 1] = next_diagonal_index; - // Perform a circular rotation of those buffers, to reuse the memory. - sz_size_t *temporary = previous_distances; - previous_distances = current_distances; - current_distances = next_distances; - next_distances = temporary; - } - - // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a - // larger diagonal. From now onwards, we will be shrinking. 
Instead of adding value equal to the skewed diagonal - // index on either side, we will be cropping those values out. - sz_size_t diagonals_count = n + n - 1; - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; - for (sz_size_t i = 0; i != next_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[shorter_length - 1 - i] != longer[next_diagonal_index - n + i]; - sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; - sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; - next_distances[i] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); - } - // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift, - // dropping the first element in the current array. - sz_size_t *temporary = previous_distances; - previous_distances = current_distances + 1; - current_distances = next_distances; - next_distances = temporary; - } - - // Cache scalar before `free` call. - sz_size_t result = current_distances[0]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -/** - * @brief Describes the length of a UTF8 character / codepoint / rune in bytes. - */ -typedef enum { - sz_utf8_invalid_k = 0, //!< Invalid UTF8 character. - sz_utf8_rune_1byte_k = 1, //!< 1-byte UTF8 character. - sz_utf8_rune_2bytes_k = 2, //!< 2-byte UTF8 character. - sz_utf8_rune_3bytes_k = 3, //!< 3-byte UTF8 character. - sz_utf8_rune_4bytes_k = 4, //!< 4-byte UTF8 character. -} sz_rune_length_t; - -typedef sz_u32_t sz_rune_t; - -/** - * @brief Extracts just one UTF8 codepoint from a UTF8 string into a 32-bit unsigned integer. - */ -SZ_INTERNAL void _sz_extract_utf8_rune(sz_cptr_t utf8, sz_rune_t *code, sz_rune_length_t *code_length) { - sz_u8_t const *current = (sz_u8_t const *)utf8; - sz_u8_t leading_byte = *current++; - sz_rune_t ch; - sz_rune_length_t ch_length; - - // TODO: This can be made entirely branchless using 32-bit SWAR. - if (leading_byte < 0x80) { - // Single-byte rune (0xxxxxxx) - ch = leading_byte; - ch_length = sz_utf8_rune_1byte_k; - } - else if ((leading_byte & 0xE0) == 0xC0) { - // Two-byte rune (110xxxxx 10xxxxxx) - ch = (leading_byte & 0x1F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_2bytes_k; - } - else if ((leading_byte & 0xF0) == 0xE0) { - // Three-byte rune (1110xxxx 10xxxxxx 10xxxxxx) - ch = (leading_byte & 0x0F) << 12; - ch |= (*current++ & 0x3F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_3bytes_k; - } - else if ((leading_byte & 0xF8) == 0xF0) { - // Four-byte rune (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) - ch = (leading_byte & 0x07) << 18; - ch |= (*current++ & 0x3F) << 12; - ch |= (*current++ & 0x3F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_4bytes_k; - } - else { - // Invalid UTF8 rune. - ch = 0; - ch_length = sz_utf8_invalid_k; - } - *code = ch; - *code_length = ch_length; -} - -/** - * @brief Exports a UTF8 string into a UTF32 buffer. - * ! The result is undefined id the UTF8 string is corrupted. - * @return The length in the number of codepoints. 
- */ -SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_length, sz_rune_t *utf32) { - sz_cptr_t const end = utf8 + utf8_length; - sz_size_t count = 0; - sz_rune_length_t rune_length; - for (; utf8 != end; utf8 += rune_length, utf32++, count++) _sz_extract_utf8_rune(utf8, utf32, &rune_length); - return count; -} - -/** - * @brief Compute the Levenshtein distance between two strings using the Wagner-Fisher algorithm. - * Stores only 2 rows of the Levenshtein matrix, but uses 64-bit integers for the distance values, - * and upcasts UTF8 variable-length codepoints to 64-bit integers for faster addressing. - * - * ! In the worst case for 2 strings of length 100, that contain just one 16-bit codepoint this will result in extra: - * + 2 rows * 100 slots * 8 bytes/slot = 1600 bytes of memory for the two rows of the Levenshtein matrix rows. - * + 100 codepoints * 2 strings * 4 bytes/codepoint = 800 bytes of memory for the UTF8 buffer. - * = 2400 bytes of memory or @b 12x memory amplification! - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_bool_t can_be_unicode, sz_memory_allocator_t *alloc) { - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // A good idea may be to dispatch different kernels for different string lengths. - // Like using `uint8_t` counters for strings under 255 characters long. - // Good in theory, this results in frequent upcasts and downcasts in serial code. - // On strings over 20 bytes, using `uint8` over `uint64` on 64-bit x86 CPU doubles the execution time. - // So one must be very cautious with such optimizations. - typedef sz_size_t _distance_t; - - // Compute the number of columns in our Levenshtein matrix. - sz_size_t const n = shorter_length + 1; - - // If a buffering memory-allocator is provided, this operation is practically free, - // and cheaper than allocating even 512 bytes (for small distance matrices) on stack. - sz_size_t buffer_length = sizeof(_distance_t) * (n * 2); - - // If the strings contain Unicode characters, let's estimate the max character width, - // and use it to allocate a larger buffer to decode UTF8. - if ((can_be_unicode == sz_true_k) && - (sz_isascii(longer, longer_length) == sz_false_k || sz_isascii(shorter, shorter_length) == sz_false_k)) { - buffer_length += (shorter_length + longer_length) * sizeof(sz_rune_t); - } - else { can_be_unicode = sz_false_k; } - - // If the allocation fails, return the maximum distance. - sz_ptr_t const buffer = (sz_ptr_t)alloc->allocate(buffer_length, alloc->handle); - if (!buffer) return SZ_SIZE_MAX; - - // Let's export the UTF8 sequence into the newly allocated buffer at the end. - if (can_be_unicode == sz_true_k) { - sz_rune_t *const longer_utf32 = (sz_rune_t *)(buffer + sizeof(_distance_t) * (n * 2)); - sz_rune_t *const shorter_utf32 = longer_utf32 + longer_length; - // Export the UTF8 sequences into the newly allocated buffer. 
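The decoding above follows the standard leading-byte classification for UTF-8. A compact stand-alone sketch of the same classification (hypothetical names, well-formed input assumed) that just counts codepoints:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical helper: byte length of the UTF-8 rune starting with `lead`,
   assuming the input is well-formed; invalid bytes advance by one to stay bounded. */
static size_t utf8_rune_length(uint8_t lead) {
    if (lead < 0x80) return 1;           /* 0xxxxxxx */
    if ((lead & 0xE0) == 0xC0) return 2; /* 110xxxxx 10xxxxxx */
    if ((lead & 0xF0) == 0xE0) return 3; /* 1110xxxx 10xxxxxx 10xxxxxx */
    if ((lead & 0xF8) == 0xF0) return 4; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    return 1;
}

static size_t utf8_count_runes(char const *utf8, size_t length) {
    size_t count = 0;
    for (size_t i = 0; i < length; i += utf8_rune_length((uint8_t)utf8[i])) ++count;
    return count;
}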
- longer_length = _sz_export_utf8_to_utf32(longer, longer_length, longer_utf32); - shorter_length = _sz_export_utf8_to_utf32(shorter, shorter_length, shorter_utf32); - longer = (sz_cptr_t)longer_utf32; - shorter = (sz_cptr_t)shorter_utf32; - } - - // Let's parameterize the core logic for different character types and distance types. -#define _wagner_fisher_unbounded(_distance_t, _char_t) \ - /* Now let's cast our pointer to avoid it in subsequent sections. */ \ - _char_t const *const longer_chars = (_char_t const *)longer; \ - _char_t const *const shorter_chars = (_char_t const *)shorter; \ - _distance_t *previous_distances = (_distance_t *)buffer; \ - _distance_t *current_distances = previous_distances + n; \ - /* Initialize the first row of the Levenshtein matrix with `iota`-style arithmetic progression. */ \ - for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ - /* The main loop of the algorithm with quadratic complexity. */ \ - for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ - _char_t const longer_char = longer_chars[idx_longer]; \ - /* Using pure pointer arithmetic is faster than iterating with an index. */ \ - _char_t const *shorter_ptr = shorter_chars; \ - _distance_t const *previous_ptr = previous_distances; \ - _distance_t *current_ptr = current_distances; \ - _distance_t *const current_end = current_ptr + shorter_length; \ - current_ptr[0] = idx_longer + 1; \ - for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ - _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ - /* We can avoid `+1` for costs here, shifting it to post-minimum computation, */ \ - /* saving one increment operation. */ \ - _distance_t cost_deletion = previous_ptr[1]; \ - _distance_t cost_insertion = current_ptr[0]; \ - /* ? It might be a good idea to enforce branchless execution here. */ \ - /* ? The caveat being that the benchmarks on longer sequences backfire and more research is needed. */ \ - current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ - } \ - /* Swap `previous_distances` and `current_distances` pointers. */ \ - _distance_t *temporary = previous_distances; \ - previous_distances = current_distances; \ - current_distances = temporary; \ - } \ - /* Cache scalar before `free` call. */ \ - sz_size_t result = previous_distances[shorter_length]; \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return result; - - // Let's define a separate variant for bounded distance computation. - // Practically the same as unbounded, but also collecting the running minimum within each row for early exit. 
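The `_wagner_fisher_unbounded` macro above may be easier to follow in de-macro-ified form. The sketch below is a plain, byte-only, unbounded version of the same two-row recurrence, with hypothetical names and `malloc` in place of the pluggable allocator:

#include <stddef.h>
#include <stdlib.h>

static size_t size_min(size_t a, size_t b) { return a < b ? a : b; }

/* Hypothetical two-row Levenshtein distance over raw bytes, unbounded. */
static size_t levenshtein_two_rows(char const *longer, size_t longer_len, char const *shorter, size_t shorter_len) {
    size_t n = shorter_len + 1;
    size_t *buffer = (size_t *)malloc(sizeof(size_t) * n * 2);
    if (!buffer) return (size_t)-1;
    size_t *previous = buffer, *current = buffer + n;
    for (size_t j = 0; j != n; ++j) previous[j] = j; /* first row: 0, 1, 2, ... */
    for (size_t i = 0; i != longer_len; ++i) {
        current[0] = i + 1;                          /* first column of this row */
        for (size_t j = 0; j != shorter_len; ++j) {
            size_t substitution = previous[j] + (longer[i] != shorter[j]);
            size_t deletion_or_insertion = size_min(previous[j + 1], current[j]) + 1;
            current[j + 1] = size_min(substitution, deletion_or_insertion);
        }
        size_t *swap = previous; previous = current; current = swap; /* reuse the two rows */
    }
    size_t result = previous[shorter_len];
    free(buffer);
    return result;
}

The bounded macro that follows is the same loop plus a per-row running minimum, which allows returning `bound` early once no cell in a row can stay under it.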
-#define _wagner_fisher_bounded(_distance_t, _char_t) \ - _char_t const *const longer_chars = (_char_t const *)longer; \ - _char_t const *const shorter_chars = (_char_t const *)shorter; \ - _distance_t *previous_distances = (_distance_t *)buffer; \ - _distance_t *current_distances = previous_distances + n; \ - for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ - for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ - _char_t const longer_char = longer_chars[idx_longer]; \ - _char_t const *shorter_ptr = shorter_chars; \ - _distance_t const *previous_ptr = previous_distances; \ - _distance_t *current_ptr = current_distances; \ - _distance_t *const current_end = current_ptr + shorter_length; \ - current_ptr[0] = idx_longer + 1; \ - /* Initialize min_distance with a value greater than bound */ \ - _distance_t min_distance = bound - 1; \ - for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ - _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ - _distance_t cost_deletion = previous_ptr[1]; \ - _distance_t cost_insertion = current_ptr[0]; \ - current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ - /* Keep track of the minimum distance seen so far in this row */ \ - min_distance = sz_min_of_two(current_ptr[1], min_distance); \ - } \ - /* If the minimum distance in this row exceeded the bound, return early */ \ - if (min_distance >= bound) { \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return bound; \ - } \ - _distance_t *temporary = previous_distances; \ - previous_distances = current_distances; \ - current_distances = temporary; \ - } \ - sz_size_t result = previous_distances[shorter_length]; \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return sz_min_of_two(result, bound); - - // Dispatch the actual computation. - if (!bound) { - if (can_be_unicode == sz_true_k) { _wagner_fisher_unbounded(sz_size_t, sz_rune_t); } - else { _wagner_fisher_unbounded(sz_size_t, sz_u8_t); } - } - else { - if (can_be_unicode == sz_true_k) { _wagner_fisher_bounded(sz_size_t, sz_rune_t); } - else { _wagner_fisher_bounded(sz_size_t, sz_u8_t); } - } -} - -SZ_PUBLIC sz_size_t sz_edit_distance_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Skip the matching prefixes and suffixes, they won't affect the distance. - for (sz_cptr_t a_end = longer + longer_length, b_end = shorter + shorter_length; - longer != a_end && shorter != b_end && *longer == *shorter; - ++longer, ++shorter, --longer_length, --shorter_length); - for (; longer_length && shorter_length && longer[longer_length - 1] == shorter[shorter_length - 1]; - --longer_length, --shorter_length); - - // Bounded computations may exit early. - int const is_bounded = bound < longer_length; - if (is_bounded) { - // If one of the strings is empty - the edit distance is equal to the length of the other one. 
- if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); - // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; - } - - if (shorter_length == 0) return longer_length; // If no mismatches were found - the distance is zero. - if (shorter_length == longer_length && !is_bounded) - return _sz_edit_distance_skewed_diagonals_serial(longer, longer_length, shorter, shorter_length, bound, alloc); - return _sz_edit_distance_wagner_fisher_serial(longer, longer_length, shorter, shorter_length, bound, sz_false_k, - alloc); -} - -SZ_PUBLIC sz_ssize_t sz_alignment_score_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc) { - - // If one of the strings is empty - the edit distance is equal to the length of the other one - if (longer_length == 0) return (sz_ssize_t)shorter_length * gap; - if (shorter_length == 0) return (sz_ssize_t)longer_length * gap; - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - sz_size_t n = shorter_length + 1; - sz_size_t buffer_length = sizeof(sz_ssize_t) * n * 2; - sz_ssize_t *distances = (sz_ssize_t *)alloc->allocate(buffer_length, alloc->handle); - sz_ssize_t *previous_distances = distances; - sz_ssize_t *current_distances = previous_distances + n; - - for (sz_size_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) - previous_distances[idx_shorter] = (sz_ssize_t)idx_shorter * gap; - - sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter; - sz_u8_t const *longer_unsigned = (sz_u8_t const *)longer; - for (sz_size_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { - current_distances[0] = ((sz_ssize_t)idx_longer + 1) * gap; - - // Initialize min_distance with a value greater than bound - sz_error_cost_t const *a_subs = subs + longer_unsigned[idx_longer] * 256ul; - for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) { - sz_ssize_t cost_deletion = previous_distances[idx_shorter + 1] + gap; - sz_ssize_t cost_insertion = current_distances[idx_shorter] + gap; - sz_ssize_t cost_substitution = previous_distances[idx_shorter] + a_subs[shorter_unsigned[idx_shorter]]; - current_distances[idx_shorter + 1] = sz_max_of_three(cost_deletion, cost_insertion, cost_substitution); - } - - // Swap previous_distances and current_distances pointers - sz_pointer_swap((void **)&previous_distances, (void **)¤t_distances); - } - - // Cache scalar before `free` call. 
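The scoring loop above is the classic global-alignment recurrence with a dense substitution matrix and a linear gap penalty; a stand-alone two-row sketch of the same idea, with hypothetical names and a 256-by-256 cost table:

#include <stddef.h>
#include <stdlib.h>

static long max3(long a, long b, long c) { long m = a > b ? a : b; return m > c ? m : c; }

/* Hypothetical two-row global alignment score: substitution costs come from a
   256x256 table indexed by byte values, gaps cost a flat `gap` per character. */
static long align_score(char const *a, size_t a_len, char const *b, size_t b_len,
                        long const subs[256][256], long gap) {
    size_t n = b_len + 1;
    long *buffer = (long *)malloc(sizeof(long) * n * 2);
    if (!buffer) return 0; /* error handling elided in this sketch */
    long *previous = buffer, *current = buffer + n;
    for (size_t j = 0; j != n; ++j) previous[j] = (long)j * gap;
    for (size_t i = 0; i != a_len; ++i) {
        current[0] = (long)(i + 1) * gap;
        long const *row = subs[(unsigned char)a[i]];
        for (size_t j = 0; j != b_len; ++j)
            current[j + 1] = max3(previous[j + 1] + gap,                   /* deletion */
                                  current[j] + gap,                        /* insertion */
                                  previous[j] + row[(unsigned char)b[j]]); /* substitution */
        long *swap = previous; previous = current; current = swap;
    }
    long score = previous[b_len];
    free(buffer);
    return score;
}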
- sz_ssize_t result = previous_distances[shorter_length]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - - sz_size_t const min_length = sz_min_of_two(a_length, b_length); - sz_size_t const max_length = sz_max_of_two(a_length, b_length); - sz_cptr_t const a_end = a + min_length; - bound = bound == 0 ? max_length : bound; - - // Walk through both strings using SWAR and counting the number of differing characters. - sz_size_t distance = max_length - min_length; -#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN - if (min_length >= SZ_SWAR_THRESHOLD) { - sz_u64_vec_t a_vec, b_vec, match_vec; - for (; a + 8 <= a_end && distance < bound; a += 8, b += 8) { - a_vec.u64 = sz_u64_load(a).u64; - b_vec.u64 = sz_u64_load(b).u64; - match_vec = _sz_u64_each_byte_equal(a_vec, b_vec); - distance += sz_u64_popcount((~match_vec.u64) & 0x8080808080808080ull); - } - } -#endif - - for (; a != a_end && distance < bound; ++a, ++b) { distance += (*a != *b); } - return sz_min_of_two(distance, bound); -} - -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - - sz_cptr_t const a_end = a + a_length; - sz_cptr_t const b_end = b + b_length; - sz_size_t distance = 0; - - sz_rune_t a_rune, b_rune; - sz_rune_length_t a_rune_length, b_rune_length; - - if (bound) { - for (; a < a_end && b < b_end && distance < bound; a += a_rune_length, b += b_rune_length) { - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - distance += (a_rune != b_rune); - } - // If one string has more runes, we need to go through the tail. - if (distance < bound) { - for (; a < a_end && distance < bound; a += a_rune_length, ++distance) - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - - for (; b < b_end && distance < bound; b += b_rune_length, ++distance) - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - } - } - else { - for (; a < a_end && b < b_end; a += a_rune_length, b += b_rune_length) { - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - distance += (a_rune != b_rune); - } - // If one string has more runes, we need to go through the tail. - for (; a < a_end; a += a_rune_length, ++distance) _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - for (; b < b_end; b += b_rune_length, ++distance) _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - } - return distance; -} - -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length) { - sz_u64_t checksum = 0; - sz_u8_t const *text_u8 = (sz_u8_t const *)text; - sz_u8_t const *text_end = text_u8 + length; - for (; text_u8 != text_end; ++text_u8) checksum += *text_u8; - return checksum; -} - -/** - * @brief Largest prime number that fits into 31 bits. - * @see https://mersenneforum.org/showthread.php?t=3471 - */ -#define SZ_U32_MAX_PRIME (2147483647u) - -/** - * @brief Largest prime number that fits into 64 bits. - * @see https://mersenneforum.org/showthread.php?t=3471 - * - * 2^64 = 18,446,744,073,709,551,616 - * this = 18,446,744,073,709,551,557 - * diff = 59 - */ -#define SZ_U64_MAX_PRIME (18446744073709551557ull) - -/* - * One hardware-accelerated way of mixing hashes can be CRC, but it's only implemented for 32-bit values. 
- * Using a Boost-like mixer works very poorly in such case: - * - * hash_first ^ (hash_second + 0x517cc1b727220a95 + (hash_first << 6) + (hash_first >> 2)); - * - * Let's stick to the Fibonacci hash trick using the golden ratio. - * https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ - */ -#define _sz_hash_mix(first, second) ((first * 11400714819323198485ull) ^ (second * 11400714819323198485ull)) -#define _sz_shift_low(x) (x) -#define _sz_shift_high(x) ((x + 77ull) & 0xFFull) -#define _sz_prime_mod(x) (x % SZ_U64_MAX_PRIME) - -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length) { - - sz_u64_t hash_low = 0; - sz_u64_t hash_high = 0; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - switch (length) { - case 0: return 0; - - // Texts under 7 bytes long are definitely below the largest prime. - case 1: - hash_low = _sz_shift_low(text[0]); - hash_high = _sz_shift_high(text[0]); - break; - case 2: - hash_low = _sz_shift_low(text[0]) * 31ull + _sz_shift_low(text[1]); - hash_high = _sz_shift_high(text[0]) * 257ull + _sz_shift_high(text[1]); - break; - case 3: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull + // - _sz_shift_low(text[2]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull + // - _sz_shift_high(text[2]); - break; - case 4: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull + // - _sz_shift_low(text[3]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull + // - _sz_shift_high(text[3]); - break; - case 5: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull + // - _sz_shift_low(text[4]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull + // - _sz_shift_high(text[4]); - break; - case 6: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull + // - _sz_shift_low(text[5]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull + // - _sz_shift_high(text[5]); - break; - case 7: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull * 31ull + // - _sz_shift_low(text[5]) * 31ull + // - _sz_shift_low(text[6]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull 
* 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull * 257ull + // - _sz_shift_high(text[5]) * 257ull + // - _sz_shift_high(text[6]); - break; - default: - // Unroll the first seven cycles: - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - hash_low = hash_low * 31ull + _sz_shift_low(text[1]); - hash_high = hash_high * 257ull + _sz_shift_high(text[1]); - hash_low = hash_low * 31ull + _sz_shift_low(text[2]); - hash_high = hash_high * 257ull + _sz_shift_high(text[2]); - hash_low = hash_low * 31ull + _sz_shift_low(text[3]); - hash_high = hash_high * 257ull + _sz_shift_high(text[3]); - hash_low = hash_low * 31ull + _sz_shift_low(text[4]); - hash_high = hash_high * 257ull + _sz_shift_high(text[4]); - hash_low = hash_low * 31ull + _sz_shift_low(text[5]); - hash_high = hash_high * 257ull + _sz_shift_high(text[5]); - hash_low = hash_low * 31ull + _sz_shift_low(text[6]); - hash_high = hash_high * 257ull + _sz_shift_high(text[6]); - text += 7; - - // Iterate throw the rest with the modulus: - for (; text != text_end; ++text) { - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - } - break; - } - - return _sz_hash_mix(hash_low, hash_high); -} - -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Compute the initial hash value for the first window. - sz_u64_t hash_low = 0, hash_high = 0, hash_mix; - for (sz_u8_t const *first_end = text + window_length; text < first_end; ++text) - hash_low = (hash_low * 31ull + _sz_shift_low(*text)) % SZ_U64_MAX_PRIME, - hash_high = (hash_high * 257ull + _sz_shift_high(*text)) % SZ_U64_MAX_PRIME; - - // In most cases the fingerprint length will be a power of two. - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - - // Compute the hash value for every window, exporting into the fingerprint, - // using the expensive modulo operation. - sz_size_t cycles = 1; - sz_size_t const step_mask = step - 1; - for (; text < text_end; ++text, ++cycles) { - // Discard one character: - hash_low -= _sz_shift_low(*(text - window_length)) * prime_power_low; - hash_high -= _sz_shift_high(*(text - window_length)) * prime_power_high; - // And add a new one: - hash_low = 31ull * hash_low + _sz_shift_low(*text); - hash_high = 257ull * hash_high + _sz_shift_high(*text); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - // Mix only if we've skipped enough hashes. 
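The drop-one-add-one update in this loop is the classic polynomial rolling hash. A compact stand-alone sketch with hypothetical names, a single base, and plain 64-bit wrap-around instead of the prime modulus used above:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical single-base rolling hash: prints a hash per window, updating it in O(1)
   by removing the outgoing byte's contribution and appending the incoming byte. */
static void rolling_hashes(char const *text, size_t length, size_t window, uint64_t base) {
    if (!window || length < window) return;
    uint64_t power = 1, hash = 0; /* power becomes base^(window - 1) */
    for (size_t i = 0; i + 1 < window; ++i) power *= base;
    for (size_t i = 0; i != window; ++i) hash = hash * base + (uint8_t)text[i];
    printf("window 0: %016llx\n", (unsigned long long)hash);
    for (size_t i = window; i != length; ++i) {
        hash -= (uint8_t)text[i - window] * power; /* drop the outgoing byte */
        hash = hash * base + (uint8_t)text[i];     /* append the incoming byte */
        printf("window %zu: %016llx\n", i - window + 1, (unsigned long long)hash);
    }
}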
- if ((cycles & step_mask) == 0) { - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - } - } -} - -#undef _sz_shift_low -#undef _sz_shift_high -#undef _sz_hash_mix -#undef _sz_prime_mod - -/** - * @brief Uses a small lookup-table to convert a lowercase character to uppercase. - */ -SZ_INTERNAL sz_u8_t sz_u8_tolower(sz_u8_t c) { - static sz_u8_t const lowered[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return lowered[c]; -} - -/** - * @brief Uses a small lookup-table to convert an uppercase character to lowercase. - */ -SZ_INTERNAL sz_u8_t sz_u8_toupper(sz_u8_t c) { - static sz_u8_t const upped[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return upped[c]; -} - -/** - * @brief Uses two small lookup tables (768 bytes total) to accelerate division by a small - * unsigned integer. Performs two lookups, one multiplication, two shifts, and two accumulations. - * - * @param divisor Integral value @b larger than one. - * @param number Integral value to divide. 
- */ -SZ_INTERNAL sz_u8_t sz_u8_divide(sz_u8_t number, sz_u8_t divisor) { - sz_assert(divisor > 1); - static sz_u16_t const multipliers[256] = { - 0, 0, 0, 21846, 0, 39322, 21846, 9363, 0, 50973, 39322, 29790, 21846, 15124, 9363, 4370, - 0, 57826, 50973, 44841, 39322, 34329, 29790, 25645, 21846, 18351, 15124, 12137, 9363, 6780, 4370, 2115, - 0, 61565, 57826, 54302, 50973, 47824, 44841, 42011, 39322, 36765, 34329, 32006, 29790, 27671, 25645, 23705, - 21846, 20063, 18351, 16706, 15124, 13602, 12137, 10725, 9363, 8049, 6780, 5554, 4370, 3224, 2115, 1041, - 0, 63520, 61565, 59668, 57826, 56039, 54302, 52614, 50973, 49377, 47824, 46313, 44841, 43407, 42011, 40649, - 39322, 38028, 36765, 35532, 34329, 33154, 32006, 30885, 29790, 28719, 27671, 26647, 25645, 24665, 23705, 22766, - 21846, 20945, 20063, 19198, 18351, 17520, 16706, 15907, 15124, 14356, 13602, 12863, 12137, 11424, 10725, 10038, - 9363, 8700, 8049, 7409, 6780, 6162, 5554, 4957, 4370, 3792, 3224, 2665, 2115, 1573, 1041, 517, - 0, 64520, 63520, 62535, 61565, 60609, 59668, 58740, 57826, 56926, 56039, 55164, 54302, 53452, 52614, 51788, - 50973, 50169, 49377, 48595, 47824, 47063, 46313, 45572, 44841, 44120, 43407, 42705, 42011, 41326, 40649, 39982, - 39322, 38671, 38028, 37392, 36765, 36145, 35532, 34927, 34329, 33738, 33154, 32577, 32006, 31443, 30885, 30334, - 29790, 29251, 28719, 28192, 27671, 27156, 26647, 26143, 25645, 25152, 24665, 24182, 23705, 23233, 22766, 22303, - 21846, 21393, 20945, 20502, 20063, 19628, 19198, 18772, 18351, 17933, 17520, 17111, 16706, 16305, 15907, 15514, - 15124, 14738, 14356, 13977, 13602, 13231, 12863, 12498, 12137, 11779, 11424, 11073, 10725, 10380, 10038, 9699, - 9363, 9030, 8700, 8373, 8049, 7727, 7409, 7093, 6780, 6470, 6162, 5857, 5554, 5254, 4957, 4662, - 4370, 4080, 3792, 3507, 3224, 2943, 2665, 2388, 2115, 1843, 1573, 1306, 1041, 778, 517, 258, - }; - // This table can be avoided using a single addition and counting trailing zeros. 
- static sz_u8_t const shifts[256] = { - 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // - 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // - 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // - 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - }; - sz_u32_t multiplier = multipliers[divisor]; - sz_u8_t shift = shifts[divisor]; - - sz_u16_t q = (sz_u16_t)((multiplier * number) >> 16); - sz_u16_t t = ((number - q) >> 1) + q; - return (sz_u8_t)(t >> shift); -} - -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result) { - sz_u8_t const *unsigned_lut = (sz_u8_t const *)lut; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = unsigned_lut[*unsigned_text]; -} - -SZ_PUBLIC void sz_tolower_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_tolower(*unsigned_text); -} - -SZ_PUBLIC void sz_toupper_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_toupper(*unsigned_text); -} - -SZ_PUBLIC void sz_toascii_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = *unsigned_text & 0x7F; -} - -/** - * @brief Check if there is a byte in this buffer, that exceeds 127 and can't be an ASCII character. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - */ -SZ_PUBLIC sz_bool_t sz_isascii_serial(sz_cptr_t text, sz_size_t length) { - - if (!length) return sz_true_k; - sz_u8_t const *h = (sz_u8_t const *)text; - sz_u8_t const *const h_end = h + length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h & 0x80ull) return sz_false_k; -#endif - - // Validate eight bytes at once using SWAR. - sz_u64_vec_t text_vec; - for (; h + 8 <= h_end; h += 8) { - text_vec.u64 = *(sz_u64_t const *)h; - if (text_vec.u64 & 0x8080808080808080ull) return sz_false_k; - } - - // Handle the misaligned tail. 
- for (; h < h_end; ++h) - if (*h & 0x80ull) return sz_false_k; - return sz_true_k; -} - -SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { - - sz_assert(alphabet_size > 0 && alphabet_size <= 256 && "Inadequate alphabet size"); - - if (alphabet_size == 1) sz_fill(result, result_length, *alphabet); - - else { - sz_assert(generator && "Expects a valid random generator"); - sz_u8_t divisor = (sz_u8_t)alphabet_size; - for (sz_cptr_t end = result + result_length; result != end; ++result) { - sz_u8_t random = generator(generator_user_data) & 0xFF; - sz_u8_t quotient = sz_u8_divide(random, divisor); - *result = alphabet[random - quotient * divisor]; - } - } -} - -#pragma endregion - -/* - * Serial implementation of string class operations. - */ -#pragma region Serial Implementation for the String Class - -SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string) { - // It doesn't matter if it's on stack or heap, the pointer location is the same. - return (sz_bool_t)((sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]); -} - -SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length) { - sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]; - sz_size_t is_big_mask = is_small - 1ull; - *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same. - // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. - *length = string->external.length & (0x00000000000000FFull | is_big_mask); -} - -SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, - sz_bool_t *is_external) { - sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]; - sz_size_t is_big_mask = is_small - 1ull; - *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same. - // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. - *length = string->external.length & (0x00000000000000FFull | is_big_mask); - // In case the string is small, the `is_small - 1ull` will become 0xFFFFFFFFFFFFFFFFull. - *space = sz_u64_blend(SZ_STRING_INTERNAL_SPACE, string->external.space, is_big_mask); - *is_external = (sz_bool_t)!is_small; -} - -SZ_PUBLIC sz_bool_t sz_string_equal(sz_string_t const *a, sz_string_t const *b) { - // Tempting to say that the external.length is bitwise the same even if it includes - // some bytes of the on-stack payload, but we don't at this writing maintain that invariant. - // (An on-stack string includes noise bytes in the high-order bits of external.length. So do this - // the hard/correct way. - -#if SZ_USE_MISALIGNED_LOADS - // Dealing with StringZilla strings, we know that the `start` pointer always points - // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. - -#endif - // Alternatively, fall back to byte-by-byte comparison. 
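The unpacking helpers above turn the "is small" predicate into an all-ones or all-zeros 64-bit mask and use it to select between the on-stack and heap interpretations of the same bytes without branching. A tiny stand-alone sketch of that mask-and-blend idiom, with made-up values:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint64_t is_small = 1;                          /* pretend the string is in the on-stack form */
    uint64_t is_big_mask = is_small - 1ull;         /* 1 -> 0x0000...0000, 0 -> 0xFFFF...FFFF */
    uint64_t packed_length = 0xAABBCCDD00000007ull; /* low byte holds the small-string length, the rest is payload noise */
    uint64_t internal_space = 23, external_space = 64; /* illustrative values, not the library's constants */
    uint64_t length = packed_length & (0xFFull | is_big_mask);                         /* keep only the low byte when small */
    uint64_t space = (internal_space & ~is_big_mask) | (external_space & is_big_mask); /* branchless blend */
    printf("length = %llu, space = %llu\n", (unsigned long long)length, (unsigned long long)space); /* 7, 23 */
    return 0;
}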
- sz_ptr_t a_start, b_start; - sz_size_t a_length, b_length; - sz_string_range(a, &a_start, &a_length); - sz_string_range(b, &b_start, &b_length); - return (sz_bool_t)(a_length == b_length && sz_equal(a_start, b_start, b_length)); -} - -SZ_PUBLIC sz_ordering_t sz_string_order(sz_string_t const *a, sz_string_t const *b) { -#if SZ_USE_MISALIGNED_LOADS - // Dealing with StringZilla strings, we know that the `start` pointer always points - // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. - -#endif - // Alternatively, fall back to byte-by-byte comparison. - sz_ptr_t a_start, b_start; - sz_size_t a_length, b_length; - sz_string_range(a, &a_start, &a_length); - sz_string_range(b, &b_start, &b_length); - return sz_order(a_start, a_length, b_start, b_length); -} - -SZ_PUBLIC void sz_string_init(sz_string_t *string) { - sz_assert(string && "String can't be SZ_NULL."); - - // Only 8 + 1 + 1 need to be initialized. - string->internal.start = &string->internal.chars[0]; - // But for safety let's initialize the entire structure to zeros. - // string->internal.chars[0] = 0; - // string->internal.length = 0; - string->words[1] = 0; - string->words[2] = 0; - string->words[3] = 0; -} - -SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator) { - sz_size_t space_needed = length + 1; // space for trailing \0 - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); - // Initialize the string to zeros for safety. - string->words[1] = 0; - string->words[2] = 0; - string->words[3] = 0; - // If we are lucky, no memory allocations will be needed. - if (space_needed <= SZ_STRING_INTERNAL_SPACE) { - string->internal.start = &string->internal.chars[0]; - string->internal.length = (sz_u8_t)length; - } - else { - // If we are not lucky, we need to allocate memory. - string->external.start = (sz_ptr_t)allocator->allocate(space_needed, allocator->handle); - if (!string->external.start) return SZ_NULL_CHAR; - string->external.length = length; - string->external.space = space_needed; - } - sz_assert(&string->internal.start == &string->external.start && "Alignment confusion"); - string->external.start[length] = 0; - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - - sz_size_t new_space = new_capacity + 1; - if (new_space <= SZ_STRING_INTERNAL_SPACE) return string->external.start; - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - sz_assert(new_space > string_space && "New space must be larger than current."); - - sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); - if (!new_start) return SZ_NULL_CHAR; - - sz_copy(new_start, string_start, string_length); - string->external.start = new_start; - string->external.space = new_space; - string->external.padding = 0; - string->external.length = string_length; - - // Deallocate the old string. 
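When the requested capacity does not fit, the expansion path below grows geometrically: at least one cache line, at least double the current space, and rounded up to a power of two. A small sketch of that policy in isolation, with hypothetical names and 64 bytes assumed as the cache-line width:

#include <stddef.h>

/* Hypothetical sketch of the capacity-growth policy used when expanding a string. */
static size_t bit_ceil_size(size_t x) {
    size_t v = 1;
    while (v < x) v <<= 1; /* smallest power of two >= x; good enough for a sketch */
    return v;
}

static size_t next_capacity(size_t current_space, size_t needed_space) {
    size_t const cache_line = 64; /* assumed width, for illustration only */
    size_t planned = current_space * 2 > cache_line ? current_space * 2 : cache_line;
    size_t minimal = bit_ceil_size(needed_space);
    return planned > minimal ? planned : minimal;
}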
- if (string_is_external) allocator->free(string_start, string_space, allocator->handle); - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // We may already be space-optimal, and in that case we don't need to do anything. - sz_size_t new_space = string_length + 1; - if (string_space == new_space || !string_is_external) return string->external.start; - - sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); - if (!new_start) return SZ_NULL_CHAR; - - sz_copy(new_start, string_start, string_length); - string->external.start = new_start; - string->external.space = new_space; - string->external.padding = 0; - string->external.length = string_length; - - // Deallocate the old string. - if (string_is_external) allocator->free(string_start, string_space, allocator->handle); - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length, - sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // The user intended to extend the string. - offset = sz_min_of_two(offset, string_length); - - // If we are lucky, no memory allocations will be needed. - if (string_length + added_length < string_space) { - sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); - string_start[string_length + added_length] = 0; - // Even if the string is on the stack, the `+=` won't affect the tail of the string. - string->external.length += added_length; - } - // If we are not lucky, we need to allocate more memory. - else { - sz_size_t next_planned_size = sz_max_of_two(SZ_CACHE_LINE_WIDTH, string_space * 2ull); - sz_size_t min_needed_space = sz_size_bit_ceil(offset + string_length + added_length + 1); - sz_size_t new_space = sz_max_of_two(min_needed_space, next_planned_size); - string_start = sz_string_reserve(string, new_space - 1, allocator); - if (!string_start) return SZ_NULL_CHAR; - - // Copy into the new buffer. - sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); - string_start[string_length + added_length] = 0; - string->external.length = string_length + added_length; - } - - return string_start; -} - -SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length) { - - sz_assert(string && "String can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // Normalize the offset, it can't be larger than the length. - offset = sz_min_of_two(offset, string_length); - - // We shouldn't normalize the length, to avoid overflowing on `offset + length >= string_length`, - // if receiving `length == SZ_SIZE_MAX`. 
After following expression the `length` will contain - // exactly the delta between original and final length of this `string`. - length = sz_min_of_two(length, string_length - offset); - - // There are 2 common cases, that wouldn't even require a `memmove`: - // 1. Erasing the entire contents of the string. - // In that case `length` argument will be equal or greater than `length` member. - // 2. Removing the tail of the string with something like `string.pop_back()` in C++. - // - // In both of those, regardless of the location of the string - stack or heap, - // the erasing is as easy as setting the length to the offset. - // In every other case, we must `memmove` the tail of the string to the left. - if (offset + length < string_length) - sz_move(string_start + offset, string_start + offset + length, string_length - offset - length); - - // The `string->external.length = offset` assignment would discard last characters - // of the on-the-stack string, but inplace subtraction would work. - string->external.length -= length; - string_start[string_length - length] = 0; - return length; -} - -SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator) { - if (!sz_string_is_on_stack(string)) - allocator->free(string->external.start, string->external.space, allocator->handle); - sz_string_init(string); -} - -// When overriding libc, disable optimisations for this function beacuse MSVC will optimize the loops into a memset. -// Which then causes a stack overflow due to infinite recursion (memset -> sz_fill_serial -> memset). -#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC -#pragma optimize("", off) -#endif -SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - sz_ptr_t end = target + length; - // Dealing with short strings, a single sequential pass would be faster. - // If the size is larger than 2 words, then at least 1 of them will be aligned. - // But just one aligned word may not be worth SWAR. - if (length < SZ_SWAR_THRESHOLD) - while (target != end) *(target++) = value; - - // In case of long strings, skip unaligned bytes, and then fill the rest in 64-bit chunks. - else { - sz_u64_t value64 = (sz_u64_t)value * 0x0101010101010101ull; - while ((sz_size_t)target & 7ull) *(target++) = value; - while (target + 8 <= end) *(sz_u64_t *)target = value64, target += 8; - while (target != end) *(target++) = value; - } -} -#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC -#pragma optimize("", on) -#endif - -SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_MISALIGNED_LOADS - while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)source, target += 8, source += 8, length -= 8; -#endif - while (length--) *(target++) = *(source++); -} - -SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // Implementing `memmove` is trickier, than `memcpy`, as the ranges may overlap. - // Existing implementations often have two passes, in normal and reversed order, - // depending on the relation of `target` and `source` addresses. - // https://student.cs.uwaterloo.ca/~cs350/common/os161-src-html/doxygen/html/memmove_8c_source.html - // https://marmota.medium.com/c-language-making-memmove-def8792bb8d5 - // - // We can use the `memcpy` like left-to-right pass if we know that the `target` is before `source`. - // Or if we know that they don't intersect! 
In that case, the traversal order is irrelevant,
- // but older CPUs may predict and fetch forward-passes better.
- if (target < source || target >= source + length) {
-#if SZ_USE_MISALIGNED_LOADS
- while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)(source), target += 8, source += 8, length -= 8;
-#endif
- while (length--) *(target++) = *(source++);
- }
- else {
- // Jump to the end and walk backwards.
- target += length, source += length;
-#if SZ_USE_MISALIGNED_LOADS
- while (length >= 8) *(sz_u64_t *)(target -= 8) = *(sz_u64_t const *)(source -= 8), length -= 8;
-#endif
- while (length--) *(--target) = *(--source);
- }
-}
-
-#pragma endregion
-
-/*
- * @brief Serial implementation for string sequence processing.
- */
-#pragma region Serial Implementation for Sequences
-
-SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) {
-
- sz_size_t matches = 0;
- while (matches != sequence->count && predicate(sequence, sequence->order[matches])) ++matches;
-
- for (sz_size_t i = matches + 1; i < sequence->count; ++i)
- if (predicate(sequence, sequence->order[i]))
- sz_u64_swap(sequence->order + i, sequence->order + matches), ++matches;
-
- return matches;
-}
-
-SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) {
-
- sz_size_t start_b = partition + 1;
-
- // If the direct merge is already sorted
- if (!less(sequence, sequence->order[start_b], sequence->order[partition])) return;
-
- sz_size_t start_a = 0;
- while (start_a <= partition && start_b <= sequence->count) {
-
- // If element 1 is in the right place
- if (!less(sequence, sequence->order[start_b], sequence->order[start_a])) { start_a++; }
- else {
- sz_size_t value = sequence->order[start_b];
- sz_size_t index = start_b;
-
- // Shift all the elements between element 1
- // and element 2, right by 1.
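- // In other words, rotate that subrange one position to the right, opening a slot at `start_a`
- // for the element taken from the right half.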
- while (index != start_a) { sequence->order[index] = sequence->order[index - 1], index--; } - sequence->order[start_a] = value; - - // Update all the pointers - start_a++; - partition++; - start_b++; - } - } -} - -SZ_PUBLIC void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - sz_u64_t *keys = sequence->order; - sz_size_t keys_count = sequence->count; - for (sz_size_t i = 1; i < keys_count; i++) { - sz_u64_t i_key = keys[i]; - sz_size_t j = i; - for (; j > 0 && less(sequence, i_key, keys[j - 1]); --j) keys[j] = keys[j - 1]; - keys[j] = i_key; - } -} - -SZ_INTERNAL void _sz_sift_down(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start, - sz_size_t end) { - sz_size_t root = start; - while (2 * root + 1 <= end) { - sz_size_t child = 2 * root + 1; - if (child + 1 <= end && less(sequence, order[child], order[child + 1])) { child++; } - if (!less(sequence, order[root], order[child])) { return; } - sz_u64_swap(order + root, order + child); - root = child; - } -} - -SZ_INTERNAL void _sz_heapify(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t count) { - sz_size_t start = (count - 2) / 2; - while (1) { - _sz_sift_down(sequence, less, order, start, count - 1); - if (start == 0) return; - start--; - } -} - -SZ_INTERNAL void _sz_heapsort(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last) { - sz_u64_t *order = sequence->order; - sz_size_t count = last - first; - _sz_heapify(sequence, less, order + first, count); - sz_size_t end = count - 1; - while (end > 0) { - sz_u64_swap(order + first, order + first + end); - end--; - _sz_sift_down(sequence, less, order + first, 0, end); - } -} - -SZ_PUBLIC void sz_sort_introsort_recursion(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, - sz_size_t last, sz_size_t depth) { - - sz_size_t length = last - first; - switch (length) { - case 0: - case 1: return; - case 2: - if (less(sequence, sequence->order[first + 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[first + 1]); - return; - case 3: { - sz_u64_t a = sequence->order[first]; - sz_u64_t b = sequence->order[first + 1]; - sz_u64_t c = sequence->order[first + 2]; - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - if (less(sequence, c, b)) sz_u64_swap(&c, &b); - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - sequence->order[first] = a; - sequence->order[first + 1] = b; - sequence->order[first + 2] = c; - return; - } - } - // Until a certain length, the quadratic-complexity insertion-sort is fine - if (length <= 16) { - sz_sequence_t sub_seq = *sequence; - sub_seq.order += first; - sub_seq.count = length; - sz_sort_insertion(&sub_seq, less); - return; - } - - // Fallback to N-logN-complexity heap-sort - if (depth == 0) { - _sz_heapsort(sequence, less, first, last); - return; - } - - --depth; - - // Median-of-three logic to choose pivot - sz_size_t median = first + length / 2; - if (less(sequence, sequence->order[median], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[median]); - if (less(sequence, sequence->order[last - 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[last - 1]); - if (less(sequence, sequence->order[median], sequence->order[last - 1])) - sz_u64_swap(&sequence->order[median], &sequence->order[last - 1]); - - // Partition using the median-of-three as the pivot - sz_u64_t pivot = sequence->order[median]; - sz_size_t left = first; - 
sz_size_t right = last - 1;
- while (1) {
- while (less(sequence, sequence->order[left], pivot)) left++;
- while (less(sequence, pivot, sequence->order[right])) right--;
- if (left >= right) break;
- sz_u64_swap(&sequence->order[left], &sequence->order[right]);
- left++;
- right--;
- }
-
- // Recursively sort the partitions
- sz_sort_introsort_recursion(sequence, less, first, left, depth);
- sz_sort_introsort_recursion(sequence, less, right + 1, last, depth);
-}
-
-SZ_PUBLIC void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) {
- if (sequence->count == 0) return;
- sz_size_t size_is_not_power_of_two = (sequence->count & (sequence->count - 1)) != 0;
- sz_size_t depth_limit = sz_size_log2i_nonzero(sequence->count) + size_is_not_power_of_two;
- sz_sort_introsort_recursion(sequence, less, 0, sequence->count, depth_limit);
-}
-
-SZ_PUBLIC void sz_sort_recursion( //
- sz_sequence_t *sequence, sz_size_t bit_idx, sz_size_t bit_max, sz_sequence_comparator_t comparator,
- sz_size_t partial_order_length) {
-
- if (!sequence->count) return;
-
- // Array of size one doesn't need sorting - only needs the prefix to be discarded.
- if (sequence->count == 1) {
- sz_u32_t *order_half_words = (sz_u32_t *)sequence->order;
- order_half_words[1] = 0;
- return;
- }
-
- // Partition a range of integers according to a specific bit value
- sz_size_t split = 0;
- sz_u64_t mask = (1ull << 63) >> bit_idx;
-
- // The clean approach would be to perform a single pass over the sequence.
- //
- // while (split != sequence->count && !(sequence->order[split] & mask)) ++split;
- // for (sz_size_t i = split + 1; i < sequence->count; ++i)
- // if (!(sequence->order[i] & mask)) sz_u64_swap(sequence->order + i, sequence->order + split), ++split;
- //
- // This, however, doesn't take into account the high relative cost of writes and swaps.
- // To circumvent that, we can first count the total number of entries to be mapped into either part.
- // And then walk through both parts, swapping the entries that are in the wrong part.
- // This would often lead to ~15% performance gain.
- sz_size_t count_with_bit_set = 0;
- for (sz_size_t i = 0; i != sequence->count; ++i) count_with_bit_set += (sequence->order[i] & mask) != 0;
- split = sequence->count - count_with_bit_set;
-
- // It's possible that the sequence is already partitioned.
- if (split != 0 && split != sequence->count) {
- // Use two pointers to efficiently reposition elements.
- // One pointer walks left-to-right from the start, and the other walks right-to-left from the end.
- sz_size_t left = 0;
- sz_size_t right = sequence->count - 1;
- while (1) {
- // Find the next element with the bit set on the left side.
- while (left < split && !(sequence->order[left] & mask)) ++left;
- // Find the next element without the bit set on the right side.
- while (right >= split && (sequence->order[right] & mask)) --right;
- // Swap the mispositioned elements.
- if (left < split && right >= split) {
- sz_u64_swap(sequence->order + left, sequence->order + right);
- ++left;
- --right;
- }
- else { break; }
- }
- }
-
- // Go down recursively.
- if (bit_idx < bit_max) {
- sz_sequence_t a = *sequence;
- a.count = split;
- sz_sort_recursion(&a, bit_idx + 1, bit_max, comparator, partial_order_length);
-
- sz_sequence_t b = *sequence;
- b.order += split;
- b.count -= split;
- sz_sort_recursion(&b, bit_idx + 1, bit_max, comparator, partial_order_length);
- }
- // Reached the end of recursion.
- else {
- // Discard the prefixes.
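- // The high 32 bits of every entry held the exported string prefix used for radix partitioning;
- // zeroing them leaves only the original indices for the comparator-based sort below.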
- sz_u32_t *order_half_words = (sz_u32_t *)sequence->order;
- for (sz_size_t i = 0; i != sequence->count; ++i) { order_half_words[i * 2 + 1] = 0; }
-
- sz_sequence_t a = *sequence;
- a.count = split;
- sz_sort_introsort(&a, comparator);
-
- sz_sequence_t b = *sequence;
- b.order += split;
- b.count -= split;
- sz_sort_introsort(&b, comparator);
- }
-}
-
-SZ_INTERNAL sz_bool_t _sz_sort_is_less(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) {
- sz_cptr_t i_str = sequence->get_start(sequence, i_key);
- sz_cptr_t j_str = sequence->get_start(sequence, j_key);
- sz_size_t i_len = sequence->get_length(sequence, i_key);
- sz_size_t j_len = sequence->get_length(sequence, j_key);
- return (sz_bool_t)(sz_order_serial(i_str, i_len, j_str, j_len) == sz_less_k);
-}
-
-SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t partial_order_length) {
-
-#if SZ_DETECT_BIG_ENDIAN
- // TODO: Implement partial sort for big-endian systems. For now this sorts the whole thing.
- sz_unused(partial_order_length);
- sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less);
-#else
-
- // Export up to 4 bytes into the `sequence` bits themselves
- for (sz_size_t i = 0; i != sequence->count; ++i) {
- sz_cptr_t begin = sequence->get_start(sequence, sequence->order[i]);
- sz_size_t length = sequence->get_length(sequence, sequence->order[i]);
- length = length > 4u ? 4u : length;
- sz_ptr_t prefix = (sz_ptr_t)&sequence->order[i];
- for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j];
- }
-
- // Perform optionally-parallel radix sort on them
- sz_sort_recursion(sequence, 0, 32, (sz_sequence_comparator_t)_sz_sort_is_less, partial_order_length);
-#endif
-}
-
-SZ_PUBLIC void sz_sort(sz_sequence_t *sequence) {
-#if SZ_DETECT_BIG_ENDIAN
- sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less);
-#else
- sz_sort_partial(sequence, sequence->count);
-#endif
-}
-
-#pragma endregion
-
-/*
- * @brief AVX2 implementation of the string search algorithms.
- * Very minimalistic, but still faster than the serial implementation.
- */
-#pragma region AVX2 Implementation
-
-#if SZ_USE_X86_AVX2
-#pragma GCC push_options
-#pragma GCC target("avx2")
-#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function)
-#include <immintrin.h>
-
-/**
- * @brief Helper structure to simplify work with 256-bit registers.
- */
-typedef union sz_u256_vec_t {
- __m256i ymm;
- __m128i xmms[2];
- sz_u64_t u64s[4];
- sz_u32_t u32s[8];
- sz_u16_t u16s[16];
- sz_u8_t u8s[32];
-} sz_u256_vec_t;
-
-SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) {
- //! Before optimizing this, read the "Operations Not Worth Optimizing" in the Contributions Guide:
- //! https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations
- return sz_order_serial(a, a_length, b, b_length);
-}
-
-SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length) {
- sz_u256_vec_t a_vec, b_vec;
-
- while (length >= 32) {
- a_vec.ymm = _mm256_lddqu_si256((__m256i const *)a);
- b_vec.ymm = _mm256_lddqu_si256((__m256i const *)b);
- // One approach can be to use "movemasks", but we could also use bitwise matching like `_mm256_testnzc_si256`.
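- // Here the movemask route is used: after inversion, a zero mask means all 32 bytes matched.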
- int difference_mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi8(a_vec.ymm, b_vec.ymm)); - if (difference_mask == 0) { a += 32, b += 32, length -= 32; } - else { return sz_false_k; } - } - - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; -} - -SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - char value_char = *(char *)&value; - __m256i value_vec = _mm256_set1_epi8(value_char); - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores". - // - // for (; length >= 32; target += 32, length -= 32) _mm256_storeu_si256(target, value_vec); - // sz_fill_serial(target, length, value); - // - // When the buffer is small, there isn't much to innovate. - if (length <= 32) sz_fill_serial(target, length, value); - // When the buffer is aligned, we can avoid any split-stores. - else { - sz_size_t head_length = (32 - ((sz_size_t)target % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - sz_u16_t value16 = (sz_u16_t)value * 0x0101u; - sz_u32_t value32 = (sz_u32_t)value16 * 0x00010001u; - sz_u64_t value64 = (sz_u64_t)value32 * 0x0000000100000001ull; - - // Fill the head of the buffer. This part is much cleaner with AVX-512. - if (head_length & 1) *(sz_u8_t *)target = value, target++, head_length--; - if (head_length & 2) *(sz_u16_t *)target = value16, target += 2, head_length -= 2; - if (head_length & 4) *(sz_u32_t *)target = value32, target += 4, head_length -= 4; - if (head_length & 8) *(sz_u64_t *)target = value64, target += 8, head_length -= 8; - if (head_length & 16) - _mm_store_si128((__m128i *)target, _mm_set1_epi8(value_char)), target += 16, head_length -= 16; - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - - // Fill the aligned body of the buffer. - for (; body_length >= 32; target += 32, body_length -= 32) _mm256_store_si256((__m256i *)target, value_vec); - - // Fill the tail of the buffer. This part is much cleaner with AVX-512. - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - if (tail_length & 16) - _mm_store_si128((__m128i *)target, _mm_set1_epi8(value_char)), target += 16, tail_length -= 16; - if (tail_length & 8) *(sz_u64_t *)target = value64, target += 8, tail_length -= 8; - if (tail_length & 4) *(sz_u32_t *)target = value32, target += 4, tail_length -= 4; - if (tail_length & 2) *(sz_u16_t *)target = value16, target += 2, tail_length -= 2; - if (tail_length & 1) *(sz_u8_t *)target = value, target++, tail_length--; - } -} - -SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores" and "loads". - // - // for (; length >= 32; target += 32, source += 32, length -= 32) - // _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - // sz_copy_serial(target, source, length); - // - // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core, - // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. - // For now, let's avoid the cases beyond the L2 size. 
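- // Anything longer than 1 MB is treated as "huge" and is routed to the bidirectional body loop below.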
- int is_huge = length > 1ull * 1024ull * 1024ull;
- if (length <= 32) { sz_copy_serial(target, source, length); }
- // When dealing with larger arrays, the optimization is not as simple as with the `sz_fill_avx2` function,
- // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer,
- // we can use aligned loads and stores, and the performance will be great.
- else if ((sz_size_t)target % 32 == 0 && (sz_size_t)source % 32 == 0 && !is_huge) {
- for (; length >= 32; target += 32, source += 32, length -= 32)
- _mm256_store_si256((__m256i *)target, _mm256_load_si256((__m256i const *)source));
- if (length) sz_copy_serial(target, source, length);
- }
- // The trickiest case is when both `source` and `target` are not aligned.
- // In this and simpler cases we can copy enough bytes into `target` to reach its cacheline boundary,
- // and then combine unaligned loads with aligned stores.
- else {
- sz_size_t head_length = (32 - ((sz_size_t)target % 32)) % 32; // 31 or less.
- sz_size_t tail_length = (sz_size_t)(target + length) % 32; // 31 or less.
- sz_size_t body_length = length - head_length - tail_length; // Multiple of 32.
-
- // Fill the head of the buffer. This part is much cleaner with AVX-512.
- if (head_length & 1) *(sz_u8_t *)target = *(sz_u8_t *)source, target++, source++, head_length--;
- if (head_length & 2) *(sz_u16_t *)target = *(sz_u16_t *)source, target += 2, source += 2, head_length -= 2;
- if (head_length & 4) *(sz_u32_t *)target = *(sz_u32_t *)source, target += 4, source += 4, head_length -= 4;
- if (head_length & 8) *(sz_u64_t *)target = *(sz_u64_t *)source, target += 8, source += 8, head_length -= 8;
- if (head_length & 16)
- _mm_store_si128((__m128i *)target, _mm_lddqu_si128((__m128i const *)source)), target += 16, source += 16,
- head_length -= 16;
- sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size.");
-
- // Fill the aligned body of the buffer.
- if (!is_huge) {
- for (; body_length >= 32; target += 32, source += 32, body_length -= 32)
- _mm256_store_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source));
- }
- // When the buffer is huge, we can traverse it in 2 directions.
- else {
- for (; body_length >= 64; target += 32, source += 32, body_length -= 64) {
- _mm256_store_si256((__m256i *)(target), _mm256_lddqu_si256((__m256i const *)(source)));
- _mm256_store_si256((__m256i *)(target + body_length - 32),
- _mm256_lddqu_si256((__m256i const *)(source + body_length - 32)));
- }
- if (body_length) _mm256_store_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source));
- }
-
- // Fill the tail of the buffer. This part is much cleaner with AVX-512.
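- // The tail mirrors the head: the same power-of-two-sized stores, issued from largest to smallest.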
- sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size.");
- if (tail_length & 16)
- _mm_store_si128((__m128i *)target, _mm_lddqu_si128((__m128i const *)source)), target += 16, source += 16,
- tail_length -= 16;
- if (tail_length & 8) *(sz_u64_t *)target = *(sz_u64_t *)source, target += 8, source += 8, tail_length -= 8;
- if (tail_length & 4) *(sz_u32_t *)target = *(sz_u32_t *)source, target += 4, source += 4, tail_length -= 4;
- if (tail_length & 2) *(sz_u16_t *)target = *(sz_u16_t *)source, target += 2, source += 2, tail_length -= 2;
- if (tail_length & 1) *(sz_u8_t *)target = *(sz_u8_t *)source, target++, source++, tail_length--;
- }
-}
-
-SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
- if (target < source || target >= source + length) {
- for (; length >= 32; target += 32, source += 32, length -= 32)
- _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source));
- while (length--) *(target++) = *(source++);
- }
- else {
- // Jump to the end and walk backwards.
- for (target += length, source += length; length >= 32; length -= 32)
- _mm256_storeu_si256((__m256i *)(target -= 32), _mm256_lddqu_si256((__m256i const *)(source -= 32)));
- while (length--) *(--target) = *(--source);
- }
-}
-
-SZ_PUBLIC sz_u64_t sz_checksum_avx2(sz_cptr_t text, sz_size_t length) {
- // The naive implementation of this function is very simple.
- // It assumes the CPU is great at handling unaligned "loads".
- //
- // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core,
- // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer.
- // For now, let's avoid the cases beyond the L2 size.
- int is_huge = length > 1ull * 1024ull * 1024ull;
-
- // When the buffer is small, there isn't much to innovate.
- if (length <= 32) { return sz_checksum_serial(text, length); }
- else if (!is_huge) {
- sz_u256_vec_t text_vec, sums_vec;
- sums_vec.ymm = _mm256_setzero_si256();
- for (; length >= 32; text += 32, length -= 32) {
- text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text);
- sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256()));
- }
- // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first.
- __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm);
- __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1);
- __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm);
- sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm);
- sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1);
- sz_u64_t result = low + high;
- if (length) result += sz_checksum_serial(text, length);
- return result;
- }
- // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use.
- // Most notably, we can avoid populating the cache with the entire buffer, and instead traverse it in 2 directions.
- else {
- sz_size_t head_length = (32 - ((sz_size_t)text % 32)) % 32; // 31 or less.
- sz_size_t tail_length = (sz_size_t)(text + length) % 32; // 31 or less.
- sz_size_t body_length = length - head_length - tail_length; // Multiple of 32.
- sz_u64_t result = 0;
-
- // Handle the head
- while (head_length--) result += *text++;
-
- sz_u256_vec_t text_vec, sums_vec;
- sums_vec.ymm = _mm256_setzero_si256();
- // Fill the aligned body of the buffer.
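- // Both branches below rely on `_mm256_stream_load_si256` over aligned 32-byte chunks.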
- if (!is_huge) {
- for (; body_length >= 32; text += 32, body_length -= 32) {
- text_vec.ymm = _mm256_stream_load_si256((__m256i const *)text);
- sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256()));
- }
- }
- // When the buffer is huge, we can traverse it in 2 directions.
- else {
- sz_u256_vec_t text_reversed_vec, sums_reversed_vec;
- sums_reversed_vec.ymm = _mm256_setzero_si256();
- for (; body_length >= 64; text += 64, body_length -= 64) {
- text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text));
- sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256()));
- text_reversed_vec.ymm = _mm256_stream_load_si256((__m256i *)(text + body_length - 64));
- sums_reversed_vec.ymm = _mm256_add_epi64(
- sums_reversed_vec.ymm, _mm256_sad_epu8(text_reversed_vec.ymm, _mm256_setzero_si256()));
- }
- if (body_length >= 32) {
- text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text));
- sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256()));
- }
- sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, sums_reversed_vec.ymm);
- }
-
- // Handle the tail
- while (tail_length--) result += *text++;
-
- // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first.
- __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm);
- __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1);
- __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm);
- sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm);
- sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1);
- result += low + high;
- return result;
- }
-}
-
-SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) {
-
- // If the input is tiny (especially smaller than the look-up table itself), we may end up paying
- // more for organizing the SIMD registers and changing the CPU state than for the actual computation.
- // But if at least 3 cache lines are touched, the AVX-2 implementation should be faster.
- if (length <= 128) {
- sz_look_up_transform_serial(source, length, lut, target);
- return;
- }
-
- // We need to pull the lookup table into 8x YMM registers.
- // The biggest issue is reorganizing the data in the lookup table, as AVX2 doesn't have 256-bit shuffle,
- // it only has 128-bit "within-lane" shuffle. Still, it's wiser to use full YMM registers, instead of XMM,
- // so that we can at least compensate high latency with twice larger window and one more level of lookup.
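- // As a scalar reference, the whole transform is just:
- //
- // for (sz_size_t i = 0; i != length; ++i) target[i] = lut[(sz_u8_t)source[i]];
- //
- // The code below emulates that byte-wise gather with nibble-indexed 16-byte shuffles and blends.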
- sz_u256_vec_t lut_0_to_15_vec, lut_16_to_31_vec, lut_32_to_47_vec, lut_48_to_63_vec, // - lut_64_to_79_vec, lut_80_to_95_vec, lut_96_to_111_vec, lut_112_to_127_vec, // - lut_128_to_143_vec, lut_144_to_159_vec, lut_160_to_175_vec, lut_176_to_191_vec, // - lut_192_to_207_vec, lut_208_to_223_vec, lut_224_to_239_vec, lut_240_to_255_vec; - - lut_0_to_15_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut))); - lut_16_to_31_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 16))); - lut_32_to_47_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 32))); - lut_48_to_63_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 48))); - lut_64_to_79_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 64))); - lut_80_to_95_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 80))); - lut_96_to_111_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 96))); - lut_112_to_127_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 112))); - lut_128_to_143_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 128))); - lut_144_to_159_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 144))); - lut_160_to_175_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 160))); - lut_176_to_191_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 176))); - lut_192_to_207_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 192))); - lut_208_to_223_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 208))); - lut_224_to_239_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 224))); - lut_240_to_255_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 240))); - - // Assuming each lookup is performed within 16 elements of 256, we need to reduce the scope by 16x = 2^4. - sz_u256_vec_t not_first_bit_vec, not_second_bit_vec, not_third_bit_vec, not_fourth_bit_vec; - - /// Top and bottom nibbles of the source are used separately. - sz_u256_vec_t source_vec, source_bot_vec; - sz_u256_vec_t blended_0_to_31_vec, blended_32_to_63_vec, blended_64_to_95_vec, blended_96_to_127_vec, - blended_128_to_159_vec, blended_160_to_191_vec, blended_192_to_223_vec, blended_224_to_255_vec; - - // Handling the head. - while (length >= 32) { - // Load and separate the nibbles of each byte in the source. - source_vec.ymm = _mm256_lddqu_si256((__m256i const *)source); - source_bot_vec.ymm = _mm256_and_si256(source_vec.ymm, _mm256_set1_epi8((char)0x0F)); - - // In the first round, we select using the 4th bit. 
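- // Each blend below picks between two adjacent 16-entry slices of the table, using the bottom nibble
- // of each byte as the shuffle index within the slice.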
- not_fourth_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x10), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_16_to_31_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_0_to_15_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_32_to_63_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_48_to_63_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_32_to_47_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_64_to_95_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_80_to_95_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_64_to_79_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_96_to_127_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_112_to_127_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_96_to_111_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_144_to_159_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_128_to_143_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_160_to_191_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_176_to_191_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_160_to_175_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_192_to_223_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_208_to_223_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_192_to_207_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_224_to_255_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_240_to_255_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_224_to_239_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - - // Perform a tree-like reduction of the 8x "blended" YMM registers, depending on the "source" content. - // The first round selects using the 3rd bit. - not_third_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x20), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_32_to_63_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_third_bit_vec.ymm); - blended_64_to_95_vec.ymm = _mm256_blendv_epi8( // - blended_96_to_127_vec.ymm, // - blended_64_to_95_vec.ymm, // - not_third_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - blended_160_to_191_vec.ymm, // - blended_128_to_159_vec.ymm, // - not_third_bit_vec.ymm); - blended_192_to_223_vec.ymm = _mm256_blendv_epi8( // - blended_224_to_255_vec.ymm, // - blended_192_to_223_vec.ymm, // - not_third_bit_vec.ymm); - - // The second round selects using the 2nd bit. - not_second_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x40), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_64_to_95_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_second_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - blended_192_to_223_vec.ymm, // - blended_128_to_159_vec.ymm, // - not_second_bit_vec.ymm); - - // The third round selects using the 1st bit. 
- not_first_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x80), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_128_to_159_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_first_bit_vec.ymm); - - // And dump the result into the target. - _mm256_storeu_si256((__m256i *)target, blended_0_to_31_vec.ymm); - source += 32, target += 32, length -= 32; - } - - // Handle the tail. - if (length) sz_look_up_transform_serial(source, length, lut, target); -} - -SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - int mask; - sz_u256_vec_t h_vec, n_vec; - n_vec.ymm = _mm256_set1_epi8(n[0]); - - while (h_length >= 32) { - h_vec.ymm = _mm256_lddqu_si256((__m256i const *)h); - mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm)); - if (mask) return h + sz_u32_ctz(mask); - h += 32, h_length -= 32; - } - - return sz_find_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - int mask; - sz_u256_vec_t h_vec, n_vec; - n_vec.ymm = _mm256_set1_epi8(n[0]); - - while (h_length >= 32) { - h_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + h_length - 32)); - mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm)); - if (mask) return h + h_length - 1 - sz_u32_clz(mask); - h_length -= 32; - } - - return sz_rfind_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_avx2(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into YMM registers. - int matches; - sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]); - n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]); - n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]); - - // Scan through the string. - for (; h_length >= n_length + 32; h += 32, h_length -= 32) { - h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_first)); - h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_mid)); - h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_last)); - matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); - while (matches) { - int potential_offset = sz_u32_ctz(matches); - if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - - return sz_find_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx2(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. 
- sz_size_t offset_first, offset_mid, offset_last;
- _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last);
-
- // Broadcast those characters into YMM registers.
- int matches;
- sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec;
- n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]);
- n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]);
- n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]);
-
- // Scan through the string.
- sz_cptr_t h_reversed;
- for (; h_length >= n_length + 32; h_length -= 32) {
- h_reversed = h + h_length - n_length - 32 + 1;
- h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_first));
- h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_mid));
- h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_last));
- matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) &
- _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) &
- _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm));
- while (matches) {
- int potential_offset = sz_u32_clz(matches);
- if (sz_equal(h + h_length - n_length - potential_offset, n, n_length))
- return h + h_length - n_length - potential_offset;
- matches &= ~(1 << (31 - potential_offset));
- }
- }
-
- return sz_rfind_serial(h, h_length, n, n_length);
-}
-
-SZ_PUBLIC sz_cptr_t sz_find_charset_avx2(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) {
-
- // Let's unzip even and odd elements and replicate them into both lanes of the YMM register.
- // That way when we invoke `_mm256_shuffle_epi8` we can use the same mask for both lanes.
- sz_u256_vec_t filter_even_vec, filter_odd_vec;
- for (sz_size_t i = 0; i != 16; ++i)
- filter_even_vec.u8s[i] = filter->_u8s[i * 2], filter_odd_vec.u8s[i] = filter->_u8s[i * 2 + 1];
- filter_even_vec.xmms[1] = filter_even_vec.xmms[0];
- filter_odd_vec.xmms[1] = filter_odd_vec.xmms[0];
-
- sz_u256_vec_t text_vec;
- sz_u256_vec_t matches_vec;
- sz_u256_vec_t lower_nibbles_vec, higher_nibbles_vec;
- sz_u256_vec_t bitset_even_vec, bitset_odd_vec;
- sz_u256_vec_t bitmask_vec, bitmask_lookup_vec;
- bitmask_lookup_vec.ymm = _mm256_set_epi8(-128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, //
- -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1);
-
- while (length >= 32) {
- // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set"
- // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so
- // StringZilla uses a somewhat different approach.
- // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new
- //
- // sz_u8_t input = *(sz_u8_t const *)text;
- // sz_u8_t lo_nibble = input & 0x0f;
- // sz_u8_t hi_nibble = input >> 4;
- // sz_u8_t bitset_even = filter_even_vec.u8s[hi_nibble];
- // sz_u8_t bitset_odd = filter_odd_vec.u8s[hi_nibble];
- // sz_u8_t bitmask = (1 << (lo_nibble & 0x7));
- // sz_u8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd;
- // if ((bitset & bitmask) != 0) return text;
- // else { length--, text++; }
- //
- // The nice part about this is that loading the strided data is very easy with Arm NEON,
- // while with x86 CPUs after AVX, shuffles within 256 bits shouldn't be an issue either.
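- // The loop body below performs that same check for 32 input bytes at a time.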
- text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); - lower_nibbles_vec.ymm = _mm256_and_si256(text_vec.ymm, _mm256_set1_epi8(0x0f)); - bitmask_vec.ymm = _mm256_shuffle_epi8(bitmask_lookup_vec.ymm, lower_nibbles_vec.ymm); - // - // At this point we can validate the `bitmask_vec` contents like this: - // - // for (sz_size_t i = 0; i != 32; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); - // } - // - // Shift right every byte by 4 bits. - // There is no `_mm256_srli_epi8` intrinsic, so we have to use `_mm256_srli_epi16` - // and combine it with a mask to clear the higher bits. - higher_nibbles_vec.ymm = _mm256_and_si256(_mm256_srli_epi16(text_vec.ymm, 4), _mm256_set1_epi8(0x0f)); - bitset_even_vec.ymm = _mm256_shuffle_epi8(filter_even_vec.ymm, higher_nibbles_vec.ymm); - bitset_odd_vec.ymm = _mm256_shuffle_epi8(filter_odd_vec.ymm, higher_nibbles_vec.ymm); - // - // At this point we can validate the `bitset_even_vec` and `bitset_odd_vec` contents like this: - // - // for (sz_size_t i = 0; i != 32; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t const *bitset_ptr = &filter->_u8s[0]; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; - // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); - // } - // - __m256i take_first = _mm256_cmpgt_epi8(_mm256_set1_epi8(8), lower_nibbles_vec.ymm); - bitset_even_vec.ymm = _mm256_blendv_epi8(bitset_odd_vec.ymm, bitset_even_vec.ymm, take_first); - - // It would have been great to have an instruction that tests the bits and then broadcasts - // the matching bit into all bits in that byte. But we don't have that, so we have to - // `and`, `cmpeq`, `movemask`, and then invert at the end... - matches_vec.ymm = _mm256_and_si256(bitset_even_vec.ymm, bitmask_vec.ymm); - matches_vec.ymm = _mm256_cmpeq_epi8(matches_vec.ymm, _mm256_setzero_si256()); - int matches_mask = ~_mm256_movemask_epi8(matches_vec.ymm); - if (matches_mask) { - int offset = sz_u32_ctz(matches_mask); - return text + offset; - } - else { text += 32, length -= 32; } - } - - return sz_find_charset_serial(text, length, filter); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx2(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); -} - -/** - * @brief There is no AVX2 instruction for fast multiplication of 64-bit integers. - * This implementation is coming from Agner Fog's Vector Class Library. 
- */ -SZ_INTERNAL __m256i _mm256_mul_epu64(__m256i a, __m256i b) { - __m256i bswap = _mm256_shuffle_epi32(b, 0xB1); - __m256i prodlh = _mm256_mullo_epi32(a, bswap); - __m256i zero = _mm256_setzero_si256(); - __m256i prodlh2 = _mm256_hadd_epi32(prodlh, zero); - __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2, 0x73); - __m256i prodll = _mm256_mul_epu32(a, b); - __m256i prod = _mm256_add_epi64(prodll, prodlh3); - return prod; -} - -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - if (length < 4 * window_length) { - sz_hashes_serial(start, length, window_length, step, callback, callback_handle); - return; - } - - // Using AVX2, we can perform 4 long integer multiplications and additions within one register. - // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel. - sz_size_t const max_hashes = length - window_length + 1; - sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads. - sz_u8_t const *text_first = (sz_u8_t const *)start; - sz_u8_t const *text_second = text_first + min_hashes_per_thread; - sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2; - sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3; - sz_u8_t const *text_end = text_first + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Broadcast the constants into the registers. - sz_u256_vec_t prime_vec, golden_ratio_vec; - sz_u256_vec_t base_low_vec, base_high_vec, prime_power_low_vec, prime_power_high_vec, shift_high_vec; - base_low_vec.ymm = _mm256_set1_epi64x(31ull); - base_high_vec.ymm = _mm256_set1_epi64x(257ull); - shift_high_vec.ymm = _mm256_set1_epi64x(77ull); - prime_vec.ymm = _mm256_set1_epi64x(SZ_U64_MAX_PRIME); - golden_ratio_vec.ymm = _mm256_set1_epi64x(11400714819323198485ull); - prime_power_low_vec.ymm = _mm256_set1_epi64x(prime_power_low); - prime_power_high_vec.ymm = _mm256_set1_epi64x(prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u256_vec_t hash_low_vec, hash_high_vec, hash_mix_vec, chars_low_vec, chars_high_vec; - hash_low_vec.ymm = _mm256_setzero_si256(); - hash_high_vec.ymm = _mm256_setzero_si256(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. 
- hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - } - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t const step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - hash_low_vec.ymm = - _mm256_sub_epi64(hash_low_vec.ymm, _mm256_mul_epu64(chars_low_vec.ymm, prime_power_low_vec.ymm)); - hash_high_vec.ymm = - _mm256_sub_epi64(hash_high_vec.ymm, _mm256_mul_epu64(chars_high_vec.ymm, prime_power_high_vec.ymm)); - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. - hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - - // 5. 
Compute the hash mix that will be used to index into the fingerprint.
- // This includes a serial step at the end.
- hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm);
- hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm);
- hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm);
- if ((cycle & step_mask) == 0) {
- callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle);
- callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle);
- callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle);
- callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle);
- }
- }
-}
-
-#pragma clang attribute pop
-#pragma GCC pop_options
-#endif
-#pragma endregion
-
-/*
- * @brief AVX-512 implementation of the string search algorithms.
- *
- * Different subsets of AVX-512 were introduced in different years:
- * - 2017 SkyLake: F, CD, ER, PF, VL, DQ, BW
- * - 2018 CannonLake: IFMA, VBMI
- * - 2019 IceLake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES
- * - 2020 TigerLake: VP2INTERSECT
- */
-#pragma region AVX512 Implementation
-
-#if SZ_USE_X86_AVX512
-#pragma GCC push_options
-#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2")
-#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function)
-#include <immintrin.h>
-
-/**
- * @brief Helper structure to simplify work with 512-bit registers.
- */
-typedef union sz_u512_vec_t {
- __m512i zmm;
- __m256i ymms[2];
- __m128i xmms[4];
- sz_u64_t u64s[8];
- sz_u32_t u32s[16];
- sz_u16_t u16s[32];
- sz_u8_t u8s[64];
- sz_i64_t i64s[8];
- sz_i32_t i32s[16];
-} sz_u512_vec_t;
-
-SZ_INTERNAL __mmask64 _sz_u64_clamp_mask_until(sz_size_t n) {
- // The simplest approach to compute this, if we know that `n` is below or equal to 64:
- // return (1ull << n) - 1;
- // A slightly more complex approach, if we don't know that `n` is under 64:
- return _bzhi_u64(0xFFFFFFFFFFFFFFFF, n < 64 ? (sz_u32_t)n : 64);
-}
-
-SZ_INTERNAL __mmask32 _sz_u32_clamp_mask_until(sz_size_t n) {
- // The simplest approach to compute this, if we know that `n` is below or equal to 32:
- // return (1ull << n) - 1;
- // A slightly more complex approach, if we don't know that `n` is under 32:
- return _bzhi_u32(0xFFFFFFFF, n < 32 ? (sz_u32_t)n : 32);
-}
-
-SZ_INTERNAL __mmask16 _sz_u16_clamp_mask_until(sz_size_t n) {
- // The simplest approach to compute this, if we know that `n` is below or equal to 16:
- // return (1ull << n) - 1;
- // A slightly more complex approach, if we don't know that `n` is under 16:
- return _bzhi_u32(0xFFFFFFFF, n < 16 ?
(sz_u32_t)n : 16);
-}
-
-SZ_INTERNAL __mmask16 _sz_u16_mask_until(sz_size_t n) {
- // The simplest approach to compute this, if we know that `n` is below or equal to 16:
- // return (1ull << n) - 1;
- // A slightly more complex approach, if we don't know that `n` is under 16:
- return (__mmask16)_bzhi_u32(0xFFFFFFFF, (sz_u32_t)n);
-}
-
-SZ_INTERNAL __mmask32 _sz_u32_mask_until(sz_size_t n) {
- // The simplest approach to compute this, if we know that `n` is below or equal to 32:
- // return (1ull << n) - 1;
- // A slightly more complex approach, if we don't know that `n` is under 32:
- return _bzhi_u32(0xFFFFFFFF, (sz_u32_t)n);
-}
-
-SZ_INTERNAL __mmask64 _sz_u64_mask_until(sz_size_t n) {
- // The simplest approach to compute this, if we know that `n` is below or equal to 64:
- // return (1ull << n) - 1;
- // A slightly more complex approach, if we don't know that `n` is under 64:
- return _bzhi_u64(0xFFFFFFFFFFFFFFFF, (sz_u32_t)n);
-}
-
-SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) {
- sz_u512_vec_t a_vec, b_vec;
-
- // Pointer arithmetic is cheap, fetching memory is not!
- // So we can use the masked loads to fetch at most one cache-line for each string,
- // compare the prefixes, and only then move forward.
- sz_size_t a_head_length = 64 - ((sz_size_t)a % 64); // 63 or less.
- sz_size_t b_head_length = 64 - ((sz_size_t)b % 64); // 63 or less.
- a_head_length = a_head_length < a_length ? a_head_length : a_length;
- b_head_length = b_head_length < b_length ? b_head_length : b_length;
- sz_size_t head_length = a_head_length < b_head_length ? a_head_length : b_head_length;
- __mmask64 head_mask = _sz_u64_mask_until(head_length);
- a_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, a);
- b_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, b);
- __mmask64 mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm);
- if (mask_not_equal != 0) {
- sz_u64_t first_diff = _tzcnt_u64(mask_not_equal);
- char a_char = a_vec.u8s[first_diff];
- char b_char = b_vec.u8s[first_diff];
- return _sz_order_scalars(a_char, b_char);
- }
- else if (head_length == a_length && head_length == b_length) { return sz_equal_k; }
- else { a += head_length, b += head_length, a_length -= head_length, b_length -= head_length; }
-
- // The rare case, when both strings are very long.
- __mmask64 a_mask, b_mask;
- while ((a_length >= 64) & (b_length >= 64)) {
- a_vec.zmm = _mm512_loadu_si512(a);
- b_vec.zmm = _mm512_loadu_si512(b);
- mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm);
- if (mask_not_equal != 0) {
- sz_u64_t first_diff = _tzcnt_u64(mask_not_equal);
- char a_char = a_vec.u8s[first_diff];
- char b_char = b_vec.u8s[first_diff];
- return _sz_order_scalars(a_char, b_char);
- }
- a += 64, b += 64, a_length -= 64, b_length -= 64;
- }
-
- // In most common scenarios, at least one of the strings is under 64 bytes.
- if (a_length | b_length) {
- a_mask = _sz_u64_clamp_mask_until(a_length);
- b_mask = _sz_u64_clamp_mask_until(b_length);
- a_vec.zmm = _mm512_maskz_loadu_epi8(a_mask, a);
- b_vec.zmm = _mm512_maskz_loadu_epi8(b_mask, b);
- // The AVX-512 `_mm512_mask_cmpneq_epi8_mask` intrinsics are generally handy in such environments.
- // They, however, have latency 3 on most modern CPUs. Using AVX2's `_mm256_cmpeq_epi8` would have
- // been cheaper, if we didn't have to apply `_mm256_movemask_epi8` afterwards.
- mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - // From logic perspective, the hardest cases are "abc\0" and "abc". - // The result must be `sz_greater_k`, as the latter is shorter. - else { return _sz_order_scalars(a_length, b_length); } - } - - return sz_equal_k; -} - -SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - __mmask64 mask; - sz_u512_vec_t a_vec, b_vec; - - while (length >= 64) { - a_vec.zmm = _mm512_loadu_si512(a); - b_vec.zmm = _mm512_loadu_si512(b); - mask = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask != 0) return sz_false_k; - a += 64, b += 64, length -= 64; - } - - if (length) { - mask = _sz_u64_mask_until(length); - a_vec.zmm = _mm512_maskz_loadu_epi8(mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(mask, b); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpneq_epi8_mask(mask, a_vec.zmm, b_vec.zmm); - return (sz_bool_t)(mask == 0); - } - - return sz_true_k; -} - -SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - __m512i value_vec = _mm512_set1_epi8(value); - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores". - // - // for (; length >= 64; target += 64, length -= 64) _mm512_storeu_si512(target, value_vec); - // _mm512_mask_storeu_epi8(target, _sz_u64_mask_until(length), value_vec); - // - // When the buffer is small, there isn't much to innovate. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, value_vec); - } - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - _mm512_mask_storeu_epi8(target, head_mask, value_vec); - for (target += head_length; body_length >= 64; target += 64, body_length -= 64) - _mm512_store_si512(target, value_vec); - _mm512_mask_storeu_epi8(target, tail_mask, value_vec); - } -} - -SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores" and "loads". - // - // for (; length >= 64; target += 64, source += 64, length -= 64) - // _mm512_storeu_si512(target, _mm512_loadu_si512(source)); - // __mmask64 mask = _sz_u64_mask_until(length); - // _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - // - // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, - // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. 
- // With two strings, we may consider the overall workload huge if each exceeds 1 MB in length.
- int const is_huge = length >= 1ull * 1024ull * 1024ull;
-
- // When the buffer is small, there isn't much to innovate.
- if (length <= 64) {
- __mmask64 mask = _sz_u64_mask_until(length);
- _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source));
- }
- // When dealing with larger arrays, the optimization is not as simple as with the `sz_fill_avx512` function,
- // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer,
- // we can use aligned loads and stores, and the performance will be great.
- else if ((sz_size_t)target % 64 == 0 && (sz_size_t)source % 64 == 0 && !is_huge) {
- for (; length >= 64; target += 64, source += 64, length -= 64)
- _mm512_store_si512(target, _mm512_load_si512(source));
- // At this point the length is guaranteed to be under 64.
- __mmask64 mask = _sz_u64_mask_until(length);
- // Aligned loads and stores would work too, but it's not defined.
- _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source));
- }
- // The trickiest case is when both `source` and `target` are not aligned.
- // In this and simpler cases we can copy enough bytes into `target` to reach its cacheline boundary,
- // and then combine unaligned loads with aligned stores.
- else if (!is_huge) {
- sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less.
- sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less.
- sz_size_t body_length = length - head_length - tail_length; // Multiple of 64.
- __mmask64 head_mask = _sz_u64_mask_until(head_length);
- __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
- _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source));
- for (target += head_length, source += head_length; body_length >= 64;
- target += 64, source += 64, body_length -= 64)
- _mm512_store_si512(target, _mm512_loadu_si512(source)); // Unaligned load, but aligned store!
- _mm512_mask_storeu_epi8(target, tail_mask, _mm512_maskz_loadu_epi8(tail_mask, source));
- }
- // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use.
- //
- // 1. Moving in both directions to maximize the throughput, when fetching from multiple
- // memory pages. Also helps with cache set-associativity issues, as we won't always
- // be fetching the same entries in the lookup table.
- // 2. Using non-temporal stores to avoid polluting the cache.
- // 3. Prefetching the next cache line, to avoid stalling the CPU. This is generally useless
- // for predictable patterns, so disregard this advice.
- //
- // Bidirectional traversal adds about 10%, accelerating from 11 GB/s to 12 GB/s.
- // Using "streaming stores" boosts us from 12 GB/s to 19 GB/s.
- else {
- sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64;
- sz_size_t tail_length = (sz_size_t)(target + length) % 64;
- sz_size_t body_length = length - head_length - tail_length;
- __mmask64 head_mask = _sz_u64_mask_until(head_length);
- __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
- _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source));
- _mm512_mask_storeu_epi8(target + head_length + body_length, tail_mask,
- _mm512_maskz_loadu_epi8(tail_mask, source + head_length + body_length));
-
- // Now, in the main loop, we can use non-temporal stores,
- // performing the operation in both directions.
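- // Each iteration stores one cache line near the front and one near the back, so the body shrinks
- // by 128 bytes per step.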
- for (target += head_length, source += head_length; // - body_length >= 128; // - target += 64, source += 64, body_length -= 128) { - _mm512_stream_si512((__m512i *)(target), _mm512_loadu_si512(source)); - _mm512_stream_si512((__m512i *)(target + body_length - 64), _mm512_loadu_si512(source + body_length - 64)); - } - if (body_length >= 64) _mm512_stream_si512((__m512i *)target, _mm512_loadu_si512(source)); - } -} - -SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - if (target == source) return; // Don't be silly, don't move the data if it's already there. - - // On very short buffers, that are one cache line in width or less, we don't need any loops. - // We can also avoid any data-dependencies between iterations, assuming we have 32 registers - // to pre-load the data, before writing it back. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - } - else if (length <= 128) { - sz_size_t last_length = length - 64; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_maskz_loadu_epi8(mask, source + 64); - _mm512_storeu_epi8(target, source0); - _mm512_mask_storeu_epi8(target + 64, mask, source1); - } - else if (length <= 192) { - sz_size_t last_length = length - 128; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_loadu_epi8(source + 64); - __m512i source2 = _mm512_maskz_loadu_epi8(mask, source + 128); - _mm512_storeu_epi8(target, source0); - _mm512_storeu_epi8(target + 64, source1); - _mm512_mask_storeu_epi8(target + 128, mask, source2); - } - else if (length <= 256) { - sz_size_t last_length = length - 192; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_loadu_epi8(source + 64); - __m512i source2 = _mm512_loadu_epi8(source + 128); - __m512i source3 = _mm512_maskz_loadu_epi8(mask, source + 192); - _mm512_storeu_epi8(target, source0); - _mm512_storeu_epi8(target + 64, source1); - _mm512_storeu_epi8(target + 128, source2); - _mm512_mask_storeu_epi8(target + 192, mask, source3); - } - - // If the regions don't overlap at all, just use "copy" and save some brain cells thinking about corner cases. - else if (target + length < source || target >= source + length) { sz_copy_avx512(target, source, length); } - - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - // The absolute most common case of using "moves" is shifting the data within a continuous buffer - // when adding a removing some values in it. In such cases, a typical shift is by 1, 2, 4, 8, 16, - // or 32 bytes, rarely larger. For small shifts, under the size of the ZMM register, we can use shuffles. 
- //
- // Remember:
- // - if we are shifting data left, we are traversing to the right.
- // - if we are shifting data right, we are traversing to the left.
- int const left_to_right_traversal = source > target;
-
- // Now we guarantee that the relative shift within registers is from 1 to 63 bytes and the output is aligned.
- // Hopefully, we need to shift more than two ZMM registers, so we could consider the `valignr` instruction.
- // Sadly, using `_mm512_alignr_epi8` doesn't make sense, as it operates at a 128-bit granularity.
- //
- // - `_mm256_alignr_epi8` shifts the entire 256-bit register, but we need many of them.
- // - `_mm512_alignr_epi32` shifts 512-bit chunks, but only if the `shift` is a multiple of 4 bytes.
- // - `_mm512_alignr_epi64` shifts 512-bit chunks by 8 bytes.
- //
- // All of those have a latency of 1 cycle, and the shift amount must be an immediate value!
- // For 1-byte-shift granularity, the `_mm512_permutex2var_epi8` has a latency of 6 and needs VBMI!
- // The most efficient and broadly compatible alternative could be to use a combination of align and shuffle.
- // A similar approach was outlined in "Byte-wise alignr in AVX512F" by Wojciech Muła.
- // http://0x80.pl/notesen/2016-10-16-avx512-byte-alignr.html
- //
- // That solution is quite a mouthful, assuming we need compile-time constants for the shift amount.
- // A cleaner one, with a latency of 3 cycles, is to use `_mm512_permutexvar_epi8` or
- // `_mm512_mask_permutexvar_epi8`, which can be seen as a combination of a cross-register shuffle and blend,
- // and is available with VBMI. That solution is still noticeably slower than AVX2.
- //
- // The GLibC implementation also uses non-temporal stores for larger buffers; we don't.
- // https://codebrowser.dev/glibc/glibc/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S.html
- if (left_to_right_traversal) {
- // Head, body, and tail.
- _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source));
- for (target += head_length, source += head_length; body_length >= 64;
- target += 64, source += 64, body_length -= 64)
- _mm512_store_si512(target, _mm512_loadu_si512(source));
- _mm512_mask_storeu_epi8(target, tail_mask, _mm512_maskz_loadu_epi8(tail_mask, source));
- }
- else {
- // Tail, body, and head.
- _mm512_mask_storeu_epi8(target + head_length + body_length, tail_mask, - _mm512_maskz_loadu_epi8(tail_mask, source + head_length + body_length)); - for (; body_length >= 64; body_length -= 64) - _mm512_store_si512(target + head_length + body_length - 64, - _mm512_loadu_si512(source + head_length + body_length - 64)); - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - } - } -} - -SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); - - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + sz_u64_ctz(mask); - h += 64, h_length -= 64; - } - - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + sz_u64_ctz(mask); - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_avx512(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. - __mmask64 matches; - __mmask64 mask; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. - // We have several optimized versions of the lagorithm for shorter strings, - // but they all mimic the default case for unbounded length needles - if (n_length >= 64) { - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - if (sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - - // TODO: If the last character contains a bad byte, we can reposition the start of the next iteration. - // This will be very helpful for very long needles. - } - } - // If there are only 2 or 3 characters in the needle, we don't even need the nested loop. 
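- // For such short needles the three anomaly offsets cover every byte of the needle, so matching
- // the first, middle, and last characters is already a full match and no extra verification is
- // needed. Roughly, the per-position check is:
- //
- // if (h[i + offset_first] == n[offset_first] && h[i + offset_mid] == n[offset_mid] &&
- // h[i + offset_last] == n[offset_last]) return h + i;
- //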
- else if (n_length <= 3) { - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - if (matches) return h + sz_u64_ctz(matches); - } - } - // If the needle is smaller than the size of the ZMM register, we can use masked comparisons - // to avoid the the inner-most nested loop and compare the entire needle against a haystack - // slice in 3 CPU cycles. - else { - __mmask64 n_mask = _sz_u64_mask_until(n_length); - sz_u512_vec_t n_full_vec, h_full_vec; - n_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, n); - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - h_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, h + potential_offset); - if (_mm512_mask_cmpneq_epi8_mask(n_mask, h_full_vec.zmm, n_full_vec.zmm) == 0) - return h + potential_offset; - matches &= matches - 1; - } - } - } - - // The "tail" of the function uses masked loads to process the remaining bytes. - { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - if (n_length <= 3 || sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); - - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h + h_length - 64); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + h_length - 1 - sz_u64_clz(mask); - h_length -= 64; - } - - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + 64 - sz_u64_clz(mask) - 1; - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. 
- if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx512(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. - __mmask64 mask; - __mmask64 matches; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. - sz_cptr_t h_reversed; - for (; h_length >= n_length + 64; h_length -= 64) { - h_reversed = h + h_length - n_length - 64 + 1; - h_first_vec.zmm = _mm512_loadu_si512(h_reversed + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h_reversed + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h_reversed + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_avx512(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } - - // The "tail" of the function uses masked loads to process the remaining bytes. - { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_avx512(h + 64 - potential_offset - 1, n, n_length)) - return h + 64 - potential_offset - 1; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } - - return SZ_NULL_CHAR; -} - -#pragma clang attribute pop -#pragma GCC pop_options - -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ - apply_to = function) - -/** - * @brief Computes the edit distance between two very short byte-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 63, and evaluates at most (63 * 2 + 1 = 127) diagonals, or just as many loop cycles. - * Supports an early exit, if the distance is bounded. - * Keeps all of the data and Levenshtein matrices skew diagonal in just a couple of registers. - * Benefits from the @b `vpermb` instructions, that can rotate the bytes across the entire ZMM register. 
- */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - - sz_size_t const max_length = 63u; - sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); - sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); - - // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register. - // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. - sz_size_t const shorter_dim = shorter_length + 1; - sz_size_t const longer_dim = longer_length + 1; - - // The next few buffers will be swapped around. - sz_u512_vec_t previous_vec, current_vec, next_vec; - sz_u512_vec_t gaps_vec, substitutions_vec; - - // Load the strings into ZMM registers - just once. - sz_u512_vec_t longer_vec, shorter_vec, shorter_rotated_vec, rotate_left_vec, rotate_right_vec, ones_vec, bound_vec; - longer_vec.zmm = _mm512_maskz_loadu_epi8(_sz_u64_mask_until(longer_length), longer); - rotate_left_vec.zmm = _mm512_set_epi8( // - 0, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, // - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, // - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, // - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - rotate_right_vec.zmm = _mm512_set_epi8( // - 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, // - 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, // - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, // - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 63); - ones_vec.zmm = _mm512_set1_epi8(1); - bound_vec.zmm = _mm512_set1_epi8(bound <= 255 ? (sz_u8_t)bound : 255); - - // To simplify comparisons and traversals, we want to reverse the order of bytes in the shorter string. - for (sz_size_t i = 0; i != shorter_length; ++i) shorter_vec.u8s[63 - i] = shorter[i]; - shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_vec.zmm); - - // Let's say we are dealing with 3 and 5 letter words. - // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim). - // It will have: - // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3. - // - 2 diagonals of fixed length, at positions: 4, 5. - // - 3 diagonals of decreasing length, at positions: 6, 7, 8. - sz_size_t const diagonals_count = shorter_dim + longer_dim - 1; - - // Initialize the first two diagonals: - // - // previous_vec.u8s[0] = 0; - // current_vec.u8s[0] = current_vec.u8s[1] = 1; - // - // We can do a similar thing with vector ops: - previous_vec.zmm = _mm512_setzero_si512(); - current_vec.zmm = _mm512_set1_epi8(1); - - // We skip diagonals 0 and 1, as they are trivial. - // We will start with diagonal 2, which has length 3, with the first and last elements being preset, - // so we are effectively computing just one value, as will be marked by a single set bit in - // the `next_diagonal_mask` on the very first iteration. - sz_size_t next_diagonal_index = 2; - __mmask64 next_diagonal_mask = 0; - - // Progress through the upper triangle of the Levenshtein matrix. 
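- // Each new skewed diagonal is assembled from the previous two. Ignoring the register rotations,
- // which stand in for the `[i - 1]` shifts, the per-cell update is roughly:
- //
- // substitution = previous[i] + (longer[...] != shorter[...]);
- // gap = min(current[i - 1], current[i]) + 1; // insertion or deletion
- // next[i] = min(substitution, gap);
- //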
- for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) { - // After this iteration, the values at offset `0` and `next_diagonal_index` in the `next_vec` - // should be set to `next_diagonal_index`, but it's easier to broadcast the value to the whole vector, - // and later merge with a mask with new values. - next_vec.zmm = _mm512_set1_epi8((sz_u8_t)next_diagonal_index); - - // The mask also adds one set bit. - next_diagonal_mask = _kor_mask64(next_diagonal_mask, 1); - next_diagonal_mask = _kshiftli_mask64(next_diagonal_mask, 1); - - // Check for equality between string slices. - __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - substitutions_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, substitutions_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(_mm512_permutexvar_epi8(rotate_right_vec.zmm, current_vec.zmm), current_vec.zmm), - ones_vec.zmm); - next_vec.zmm = _mm512_mask_min_epu8(next_vec.zmm, next_diagonal_mask, gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = current_vec.zmm; - current_vec.zmm = next_vec.zmm; - - // Shift the shorter string - shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_rotated_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. - __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - } - - // Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles. - for (; next_diagonal_index != longer_dim; ++next_diagonal_index) { - // After this iteration, the value `shorted_dim - 1` in the `next_vec` - // should be set to `next_diagonal_index`, but it's easier to broadcast the value to the whole vector, - // and later merge with a mask with new values. - next_vec.zmm = _mm512_set1_epi8((sz_u8_t)next_diagonal_index); - - // Make sure we update the first entry. - next_diagonal_mask = _kor_mask64(next_diagonal_mask, 1); - - // Check for equality between string slices. - __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(current_vec.zmm, _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm)), - ones_vec.zmm); - next_vec.zmm = _mm512_mask_min_epu8(next_vec.zmm, next_diagonal_mask, gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm); - current_vec.zmm = next_vec.zmm; - - // Let's shift the longer string now. - longer_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, longer_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. 
- __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - } - - // Now let's handle the bottom right triangle. - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - - // Check for equality between string slices. - __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(current_vec.zmm, _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm)), - ones_vec.zmm); - next_vec.zmm = _mm512_min_epu8(gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm); - current_vec.zmm = next_vec.zmm; - - // Let's shift the longer string now. - longer_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, longer_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. - __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - // In every following iterations we take use a shorter prefix of each register, - // but we don't need to update the `next_diagonal_mask` anymore... except for the early exit. - next_diagonal_mask = _kshiftri_mask64(next_diagonal_mask, 1); - } - return current_vec.u8s[0]; -} - -/** - * @brief Computes the edit distance between two somewhat short bytes-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 127, and evaluates at most (127 * 2 + 1 = 255) diagonals. - * Supports an early exit, if the distance is bounded. - * Uses a lot more CPU registers space, than the `upto63` variant. - * Benefits from the @b `vpermi2b` instructions, that can rotate the bytes in 2 registers at once. - * - * This may be one of the most freuqently called kernels for: - * - source code analysis, assuming most lines are either under 80 or under 120 characters long. - * - DNA sequence alignment, as most short reads are 50-300 characters long. - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto127_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -/** - * @brief Computes the edit distance between two longer bytes-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 255, and evaluates at most (255 * 2 + 1 = 511) diagonals. - * Supports an early exit, if the distance is bounded. - * Uses a lot more CPU registers space, than the `upto63` variant. - * - * Each of 2x string ends up occupying 4 ZMM registers, and each of 3x diagonals uses 4 ZMM registers. - * So 20x of the 32x are persistently occupied, and the rest are used for math temporarily. - * This is the largest space-efficient variant, as strings beyond 255 characters may require - * 16-bit accumulators, which would be a significant bottleneck. 
- */
-SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto_avx512( //
- sz_cptr_t shorter, sz_size_t shorter_length, //
- sz_cptr_t longer, sz_size_t longer_length, //
- sz_size_t bound) {
- sz_unused(shorter && shorter_length && longer && longer_length && bound);
- return 0;
-}
-
-/**
- * @brief Computes the edit distance between two longer byte-strings using the AVX-512VBMI extensions,
- * assuming the upper distance bound cannot exceed 255, but the string length can be arbitrary.
- *
- * Applies to string lengths up to 255, and evaluates at most (255 * 2 + 1 = 511) diagonals.
- * Supports an early exit, if the distance is bounded.
- * Uses a lot more CPU register space than the `upto63` variant.
- *
- * Each of the 2x strings ends up occupying 4 ZMM registers, and each of 3x diagonals uses 4 ZMM registers.
- * So 20x of the 32x are persistently occupied, and the rest are used for math temporarily.
- * This is the largest space-efficient variant, as strings beyond 255 characters may require
- * 16-bit accumulators, which would be a significant bottleneck.
- */
-SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto255bound_avx512( //
- sz_cptr_t shorter, sz_size_t shorter_length, //
- sz_cptr_t longer, sz_size_t longer_length, //
- sz_size_t bound) {
- sz_unused(shorter && shorter_length && longer && longer_length && bound);
- return 0;
-}
-
-/**
- * @brief Computes the edit distance between two mid-length UTF-8-strings using the AVX-512VBMI extensions.
- *
- * Applies to string lengths up to 127, and evaluates at most (127 * 2 + 1 = 255) diagonals.
- * Supports an early exit, if the distance is bounded.
- * Benefits from the @b `valignd` instructions used to rotate UTF-32 unpacked Unicode codepoints.
- *
- * Each string is unpacked into 128 characters * 4 bytes per character / 64 bytes per register = 8 registers.
- *
- */
-SZ_INTERNAL sz_size_t _sz_edit_distance_utf8_skewed_diagonals_upto127_avx512( //
- sz_cptr_t shorter, sz_size_t shorter_length, //
- sz_cptr_t longer, sz_size_t longer_length, //
- sz_size_t bound) {
- sz_unused(shorter && shorter_length && longer && longer_length && bound);
- return 0;
-}
-
-SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( //
- sz_cptr_t shorter, sz_size_t shorter_length, //
- sz_cptr_t longer, sz_size_t longer_length, //
- sz_size_t bound, sz_memory_allocator_t *alloc) {
-
- sz_unused(shorter && longer && bound && alloc);
-
- // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
- sz_memory_allocator_t global_alloc;
- if (!alloc) {
- sz_memory_allocator_init_default(&global_alloc);
- alloc = &global_alloc;
- }
-
- // TODO: Generalize!
- sz_size_t const max_length = 256u * 256u;
- sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one.");
- sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant.");
- sz_unused(longer_length && bound && max_length);
-
-#if 0
- // We are going to store 3 diagonals of the matrix.
- // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`.
- sz_size_t const shorter_dim = shorter_length + 1;
- sz_size_t const longer_dim = longer_length + 1;
- // Unlike the serial version, we also want to avoid reverse-order iteration over the shorter string.
- // So let's allocate a bit more memory and reverse-export our shorter string into that buffer.
- sz_size_t const buffer_length = sizeof(sz_u16_t) * longer_dim * 3 + shorter_length; - sz_u16_t *const distances = (sz_u16_t *)alloc->allocate(buffer_length, alloc->handle); - if (!distances) return SZ_SIZE_MAX; - - // The next few pointers will be swapped around. - sz_u16_t *previous_distances = distances; - sz_u16_t *current_distances = previous_distances + longer_dim; - sz_u16_t *next_distances = current_distances + longer_dim; - sz_ptr_t const shorter_reversed = (sz_ptr_t)(next_distances + longer_dim); - - // Export the reversed string into the buffer. - for (sz_size_t i = 0; i != shorter_length; ++i) shorter_reversed[i] = shorter[shorter_length - 1 - i]; - - // Initialize the first two diagonals: - previous_distances[0] = 0; - current_distances[0] = current_distances[1] = 1; - - // Using ZMM registers, we can process 32x 16-bit values at once, - // storing 16 bytes of each string in YMM registers. - sz_u512_vec_t insertions_vec, deletions_vec, substitutions_vec, next_vec; - sz_u512_vec_t ones_u16_vec; - ones_u16_vec.zmm = _mm512_set1_epi16(1); - - // This is a mixed-precision implementation, using 8-bit representations for part of the operations. - // Even there, in case `SZ_USE_X86_AVX2=0`, let's use the `sz_u512_vec_t` type, addressing the first YMM halfs. - sz_u512_vec_t shorter_vec, longer_vec; - sz_u512_vec_t ones_u8_vec; - ones_u8_vec.ymms[0] = _mm256_set1_epi8(1); - - // Let's say we are dealing with 3 and 5 letter words. - // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim). - // It will have: - // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3. - // - 2 diagonals of fixed length, at positions: 4, 5. - // - 3 diagonals of decreasing length, at positions: 6, 7, 8. - sz_size_t const diagonals_count = shorter_dim + longer_dim - 1; - - // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_diagonal_index = 2; - for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = next_diagonal_index + 1; - for (sz_size_t offset_within_diagonal = 0; offset_within_diagonal + 2 < next_diagonal_length;) { - sz_u32_t remaining_length = (sz_u32_t)(next_diagonal_length - offset_within_diagonal - 2); - sz_u32_t register_length = remaining_length < 32 ? remaining_length : 32; - sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length); - longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + offset_within_diagonal); - // Our original code addressed the shorter string `[next_diagonal_index - offset_within_diagonal - 2]` - // for growing `offset_within_diagonal`. If the `shorter` string was reversed, the - // `[next_diagonal_index - offset_within_diagonal - 2]` would be equal to `[shorter_length - 1 - - // next_diagonal_index + offset_within_diagonal + 2]`. Which simplified would be equal to - // `[shorter_length - next_diagonal_index + offset_within_diagonal + 1]`. - shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8( // - remaining_length_mask, - shorter_reversed + shorter_length - next_diagonal_index + offset_within_diagonal + 1); - // For substitutions, perform the equality comparison using AVX2 instead of AVX-512 - // to get the result as a vector, instead of a bitmask. Adding 1 to every scalar we can overflow - // transforming from {0xFF, 0} values to {0, 1} values - exactly what we need. Then - upcast to 16-bit. 
- substitutions_vec.zmm = _mm512_cvtepi8_epi16( // - _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0])); - substitutions_vec.zmm = _mm512_add_epi16( // - substitutions_vec.zmm, - _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + offset_within_diagonal)); - // For insertions and deletions, on modern hardware, it's faster to issue two separate loads, - // than rotate the bytes in the ZMM register. - insertions_vec.zmm = - _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + offset_within_diagonal); - deletions_vec.zmm = - _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + offset_within_diagonal + 1); - // First get the minimum of insertions and deletions. - next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm); - next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm); - _mm512_mask_storeu_epi16(next_distances + offset_within_diagonal + 1, remaining_length_mask, next_vec.zmm); - offset_within_diagonal += register_length; - } - // Don't forget to populate the first row and the first column of the Levenshtein matrix. - next_distances[0] = next_distances[next_diagonal_length - 1] = (sz_u16_t)next_diagonal_index; - // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory. - sz_u16_t *temporary = previous_distances; - previous_distances = current_distances; - current_distances = next_distances; - next_distances = temporary; - } - - // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a - // larger diagonal. From now onwards, we will be shrinking. Instead of adding value equal to the skewed diagonal - // index on either side, we will be cropping those values out. - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; - for (sz_size_t i = 0; i != next_diagonal_length;) { - sz_u32_t remaining_length = (sz_u32_t)(next_diagonal_length - i); - sz_u32_t register_length = remaining_length < 32 ? remaining_length : 32; - sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length); - longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + next_diagonal_index - n + i); - // Our original code addressed the shorter string `[shorter_length - 1 - i]` for growing `i`. - // If the `shorter` string was reversed, the `[shorter_length - 1 - i]` would - // be equal to `[shorter_length - 1 - shorter_length + 1 + i]`. - // Which simplified would be equal to just `[i]`. Beautiful! - shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, shorter_reversed + i); - // For substitutions, perform the equality comparison using AVX2 instead of AVX-512 - // to get the result as a vector, instead of a bitmask. The compare it against the accumulated - // substitution costs. - substitutions_vec.zmm = _mm512_cvtepi8_epi16( // - _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0])); - substitutions_vec.zmm = _mm512_add_epi16( // - substitutions_vec.zmm, _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + i)); - // For insertions and deletions, on modern hardware, it's faster to issue two separate loads, - // than rotate the bytes in the ZMM register. 
- insertions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i); - deletions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i + 1); - // First get the minimum of insertions and deletions. - next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm); - next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm); - _mm512_mask_storeu_epi16(next_distances + i, remaining_length_mask, next_vec.zmm); - i += register_length; - } - - // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory, this time, with a shift, - // dropping the first element in the current array. - sz_u16_t *temporary = previous_distances; - previous_distances = current_distances + 1; - current_distances = next_distances; - next_distances = temporary; - } - - // Cache scalar before `free` call. - sz_size_t result = current_distances[0]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -#endif - return 0; -} - -SZ_INTERNAL sz_size_t sz_edit_distance_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Bounded computations may exit early. - int const is_bounded = bound < longer_length; - if (is_bounded) { - // If one of the strings is empty - the edit distance is equal to the length of the other one. - if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); - // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; - } - - // Make sure the shorter string is actually shorter. - if (shorter_length > longer_length) { - sz_cptr_t temporary = shorter; - shorter = longer; - longer = temporary; - sz_size_t temporary_length = shorter_length; - shorter_length = longer_length; - longer_length = temporary_length; - } - - // Dispatch the right implementation based on the length of the strings. - if (longer_length < 64u) - return _sz_edit_distance_skewed_diagonals_upto63_avx512( // - shorter, shorter_length, longer, longer_length, bound); - // else if (longer_length < 256u * 256u) - // return _sz_edit_distance_skewed_diagonals_upto65k_avx512( // - // shorter, shorter_length, longer, longer_length, bound, alloc); - else - return sz_edit_distance_serial(shorter, shorter_length, longer, longer_length, bound, alloc); -} - -SZ_PUBLIC sz_u64_t sz_checksum_avx512(sz_cptr_t text, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "loads". - // - // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, - // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. - // With two strings, we may consider the overal workload huge, if each exceeds 1 MB in length. - int const is_huge = length >= 1ull * 1024ull * 1024ull; - sz_u512_vec_t text_vec, sums_vec; - - // When the buffer is small, there isn't much to innovate. 
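- // Every branch below computes the same value as this scalar loop, only wider:
- //
- // sz_u64_t sum = 0;
- // for (sz_size_t i = 0; i != length; ++i) sum += (sz_u8_t)text[i];
- // return sum;
- //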
- if (length <= 16) {
- __mmask16 mask = _sz_u16_mask_until(length);
- text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text);
- sums_vec.xmms[0] = _mm_sad_epu8(text_vec.xmms[0], _mm_setzero_si128());
- sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_vec.xmms[0]);
- sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_vec.xmms[0], 1);
- return low + high;
- }
- else if (length <= 32) {
- __mmask32 mask = _sz_u32_mask_until(length);
- text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text);
- sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256());
- // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first.
- __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]);
- __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1);
- __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm);
- sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm);
- sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1);
- return low + high;
- }
- else if (length <= 64) {
- __mmask64 mask = _sz_u64_mask_until(length);
- text_vec.zmm = _mm512_maskz_loadu_epi8(mask, text);
- sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
- return _mm512_reduce_add_epi64(sums_vec.zmm);
- }
- else if (!is_huge) {
- sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less.
- sz_size_t tail_length = (sz_size_t)(text + length) % 64; // 63 or less.
- sz_size_t body_length = length - head_length - tail_length; // Multiple of 64.
- __mmask64 head_mask = _sz_u64_mask_until(head_length);
- __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
- text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
- sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
- for (text += head_length; body_length >= 64; text += 64, body_length -= 64) {
- text_vec.zmm = _mm512_load_si512((__m512i const *)text);
- sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
- }
- text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text);
- sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
- return _mm512_reduce_add_epi64(sums_vec.zmm);
- }
- // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use.
- //
- // 1. Moving in both directions to maximize the throughput, when fetching from multiple
- // memory pages. Also helps with cache set-associativity issues, as we won't always
- // be fetching the same entries in the lookup table.
- // 2. Using non-temporal stores to avoid polluting the cache.
- // 3. Prefetching the next cache line, to avoid stalling the CPU. This is generally useless
- // for predictable patterns, so disregard this advice.
- //
- // Bidirectional traversal generally adds about 10% to such algorithms.
- else {
- sz_u512_vec_t text_reversed_vec, sums_reversed_vec;
- sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64;
- sz_size_t tail_length = (sz_size_t)(text + length) % 64;
- sz_size_t body_length = length - head_length - tail_length;
- __mmask64 head_mask = _sz_u64_mask_until(head_length);
- __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
-
- text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
- sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
- text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length);
- sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512());
-
- // Now in the main loop, we can use non-temporal loads,
- // performing the accumulation in both directions.
- for (text += head_length; body_length >= 128; text += 64, body_length -= 128) {
- text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
- sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
- text_reversed_vec.zmm = _mm512_stream_load_si512((__m512i *)(text + body_length - 64));
- sums_reversed_vec.zmm =
- _mm512_add_epi64(sums_reversed_vec.zmm, _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()));
- }
- if (body_length >= 64) {
- text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
- sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
- }
-
- return _mm512_reduce_add_epi64(_mm512_add_epi64(sums_vec.zmm, sums_reversed_vec.zmm));
- }
-}
-
-SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, //
- sz_hash_callback_t callback, void *callback_handle) {
-
- if (length < window_length || !window_length) return;
- if (length < 4 * window_length) {
- sz_hashes_serial(start, length, window_length, step, callback, callback_handle);
- return;
- }
-
- // Using AVX2, we can perform 4 long integer multiplications and additions within one register.
- // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel.
- sz_size_t const max_hashes = length - window_length + 1;
- sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads.
- sz_u8_t const *text_first = (sz_u8_t const *)start;
- sz_u8_t const *text_second = text_first + min_hashes_per_thread;
- sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2;
- sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3;
- sz_u8_t const *text_end = text_first + length;
-
- // Broadcast the global constants into the registers.
- // Both high and low hashes will work with the same prime and golden ratio.
- sz_u512_vec_t prime_vec, golden_ratio_vec;
- prime_vec.zmm = _mm512_set1_epi64(SZ_U64_MAX_PRIME);
- golden_ratio_vec.zmm = _mm512_set1_epi64(11400714819323198485ull);
-
- // Prepare the `base ^ (window_length - 1)` values, that we are going to use for the rolling modulo arithmetic.
- sz_u64_t prime_power_low = 1, prime_power_high = 1;
- for (sz_size_t i = 0; i + 1 < window_length; ++i)
- prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME,
- prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME;
-
- // We will be evaluating 4 offsets at a time with 2 different hash functions.
- // We can fit all those 8 state variables in each of the following ZMM registers.
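- // Per 64-bit lane, the steady-state update in the loops below follows the classic rolling-hash
- // recurrence; with illustrative scalar names, roughly:
- //
- // hash -= (outgoing + shift) * base_to_window; // base_to_window = base ^ (window_length - 1) % prime
- // hash = hash * base + (incoming + shift);
- // if (hash > prime) hash -= prime; // cheap substitute for a full modulo
- //
- // where base is 31 or 257 and shift is 0 or 77, depending on the lane.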
- sz_u512_vec_t base_vec, prime_power_vec, shift_vec; - base_vec.zmm = _mm512_set_epi64(31ull, 31ull, 31ull, 31ull, 257ull, 257ull, 257ull, 257ull); - shift_vec.zmm = _mm512_set_epi64(0ull, 0ull, 0ull, 0ull, 77ull, 77ull, 77ull, 77ull); - prime_power_vec.zmm = _mm512_set_epi64(prime_power_low, prime_power_low, prime_power_low, prime_power_low, - prime_power_high, prime_power_high, prime_power_high, prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u512_vec_t hash_vec, chars_vec; - hash_vec.zmm = _mm512_setzero_si512(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`... - chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - - // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); - } - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - sz_u512_vec_t hash_mix_vec; - hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_extracti64x4_epi64(hash_mix_vec.zmm, 0)); - - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. - chars_vec.zmm = _mm512_set_epi64(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length], // - text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - hash_vec.zmm = _mm512_sub_epi64(hash_vec.zmm, _mm512_mullo_epi64(chars_vec.zmm, prime_power_vec.zmm)); - - // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. 
- chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - - // ... and prefetch the next four characters into Level 2 or higher. - _mm_prefetch((sz_cptr_t)text_fourth + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_third + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_second + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_first + 1, _MM_HINT_T1); - - // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_castsi512_si256(hash_mix_vec.zmm)); - - if ((cycle & step_mask) == 0) { - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - } - } -} - -#pragma clang attribute pop -#pragma GCC pop_options - -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512vbmi", "avx512vbmi2", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512vbmi,avx512vbmi2,bmi,bmi2"))), \ - apply_to = function) - -SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - // But if at least 3 cache lines are touched, the AVX-512 implementation should be faster. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } - - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - // We need to pull the lookup table into 4x ZMM registers. - // We can use `vpermi2b` instruction to perform the look in two ZMM registers with `_mm512_permutex2var_epi8` - // intrinsics, but it has a 6-cycle latency on Sapphire Rapids and requires AVX512-VBMI. 
Assuming we need to - // operate on 4 registers, it might be cleaner to use 2x separate `_mm512_permutexvar_epi8` calls. - // Combining the results with 2x `_mm512_test_epi8_mask` and 3x blends afterwards. - // - // - 4x `_mm512_permutexvar_epi8` maps to "VPERMB (ZMM, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p5 - // - On Genoa: 6 cycles latency, ports: 1*FP12 - // - 3x `_mm512_mask_blend_epi8` maps to "VPBLENDMB_Z (ZMM, K, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p05 - // - On Genoa: 1 cycle latency, ports: 1*FP0123 - // - 2x `_mm512_test_epi8_mask` maps to "VPTESTMB (K, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p5 - // - On Genoa: 4 cycles latency, ports: 1*FP01 - // - sz_u512_vec_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec; - lut_0_to_63_vec.zmm = _mm512_loadu_si512((lut)); - lut_64_to_127_vec.zmm = _mm512_loadu_si512((lut + 64)); - lut_128_to_191_vec.zmm = _mm512_loadu_si512((lut + 128)); - lut_192_to_255_vec.zmm = _mm512_loadu_si512((lut + 192)); - - sz_u512_vec_t first_bit_vec, second_bit_vec; - first_bit_vec.zmm = _mm512_set1_epi8((char)0x80); - second_bit_vec.zmm = _mm512_set1_epi8((char)0x40); - - __mmask64 first_bit_mask, second_bit_mask; - sz_u512_vec_t source_vec; - // If the top bit is set in each word of `source_vec`, than we use `lookup_128_to_191_vec` or - // `lookup_192_to_255_vec`. If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`. - sz_u512_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec; - sz_u512_vec_t blended_0_to_127_vec, blended_128_to_255_vec, blended_0_to_255_vec; - - // Handling the head. - if (head_length) { - source_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_mask_storeu_epi8(target, head_mask, blended_0_to_255_vec.zmm); - source += head_length, target += head_length, length -= head_length; - } - - // Handling the body in 64-byte chunks aligned to cache-line boundaries with respect to `target`. 
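- // Per byte, the permutes and blends below implement a plain 256-entry table lookup; with the four
- // 64-byte quarters of the table loaded above, the scalar equivalent is roughly:
- //
- // sz_u8_t byte = source[i];
- // sz_u8_t const *quarter = byte & 0x80 ? (byte & 0x40 ? lut + 192 : lut + 128)
- // : (byte & 0x40 ? lut + 64 : lut);
- // target[i] = quarter[byte & 0x3F];
- //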
- while (length >= 64) { - source_vec.zmm = _mm512_loadu_si512(source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_store_si512(target, blended_0_to_255_vec.zmm); //! Aligned store, our main weapon! - source += 64, target += 64, length -= 64; - } - - // Handling the tail. - if (tail_length) { - source_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_mask_storeu_epi8(target, tail_mask, blended_0_to_255_vec.zmm); - source += tail_length, target += tail_length, length -= tail_length; - } -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - - // Before initializing the AVX-512 vectors, we may want to run the sequential code for the first few bytes. - // In practice, that only hurts, even when we have matches every 5-ish bytes. - // - // if (length < SZ_SWAR_THRESHOLD) return sz_find_charset_serial(text, length, filter); - // sz_cptr_t early_result = sz_find_charset_serial(text, SZ_SWAR_THRESHOLD, filter); - // if (early_result) return early_result; - // text += SZ_SWAR_THRESHOLD; - // length -= SZ_SWAR_THRESHOLD; - // - // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. - // That way when we invoke `_mm512_shuffle_epi8` we can use the same mask for both lanes. - sz_u512_vec_t filter_even_vec, filter_odd_vec; - __m256i filter_ymm = _mm256_lddqu_si256((__m256i const *)filter); - // There are a few way to initialize filters without having native strided loads. 
- // In the cronological order of experiments: - // - serial code initializing 128 bytes of odd and even mask - // - using several shuffles - // - using `_mm512_permutexvar_epi8` - // - using `_mm512_broadcast_i32x4(_mm256_castsi256_si128(_mm256_maskz_compress_epi8(0x55555555, filter_ymm)))` - // and `_mm512_broadcast_i32x4(_mm256_castsi256_si128(_mm256_maskz_compress_epi8(0xaaaaaaaa, filter_ymm)))` - filter_even_vec.zmm = _mm512_broadcast_i32x4(_mm256_castsi256_si128( // broadcast __m128i to __m512i - _mm256_maskz_compress_epi8(0x55555555, filter_ymm))); - filter_odd_vec.zmm = _mm512_broadcast_i32x4(_mm256_castsi256_si128( // broadcast __m128i to __m512i - _mm256_maskz_compress_epi8(0xaaaaaaaa, filter_ymm))); - // After the unzipping operation, we can validate the contents of the vectors like this: - // - // for (sz_size_t i = 0; i != 16; ++i) { - // sz_assert(filter_even_vec.u8s[i] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 16] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 16] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 32] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 32] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 48] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 48] == filter->_u8s[i * 2 + 1]); - // } - // - sz_u512_vec_t text_vec; - sz_u512_vec_t lower_nibbles_vec, higher_nibbles_vec; - sz_u512_vec_t bitset_even_vec, bitset_odd_vec; - sz_u512_vec_t bitmask_vec, bitmask_lookup_vec; - bitmask_lookup_vec.zmm = _mm512_set_epi8( // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); - - while (length) { - // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set" - // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so - // StrinZilla uses a somewhat different approach. - // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new - // - // sz_u8_t input = *(sz_u8_t const *)text; - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = filter_even_vec.u8s[hi_nibble]; - // sz_u8_t bitset_odd = filter_odd_vec.u8s[hi_nibble]; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_u8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd; - // if ((bitset & bitmask) != 0) return text; - // else { length--, text++; } - // - // The nice part about this, loading the strided data is vey easy with Arm NEON, - // while with x86 CPUs after AVX, shuffles within 256 bits shouldn't be an issue either. 
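    // For illustration, a worked instance of the scalar check quoted above, for the input
    // byte 'a' (0x61):
    //
    //      lo_nibble = 0x61 & 0x0f = 1;    hi_nibble = 0x61 >> 4 = 6;
    //      the bitset byte holding 'a' is _u8s[0x61 >> 3] = _u8s[12] = _u8s[6 * 2],
    //      i.e. the "even" byte for hi_nibble 6, because lo_nibble < 8;
    //      the bit within that byte is 1 << (0x61 & 7) = 0x02.
    //
    // `filter_even_vec` and `filter_odd_vec` hold the 16 even and 16 odd bytes of the
    // bitset replicated across the register, so the nibble-indexed shuffles below recover
    // exactly this byte-and-bit pair for up to 64 input characters at a time.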
- sz_size_t load_length = sz_min_of_two(length, 64); - __mmask64 load_mask = _sz_u64_mask_until(load_length); - text_vec.zmm = _mm512_maskz_loadu_epi8(load_mask, text); - lower_nibbles_vec.zmm = _mm512_and_si512(text_vec.zmm, _mm512_set1_epi8(0x0f)); - bitmask_vec.zmm = _mm512_shuffle_epi8(bitmask_lookup_vec.zmm, lower_nibbles_vec.zmm); - // - // At this point we can validate the `bitmask_vec` contents like this: - // - // for (sz_size_t i = 0; i != load_length; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); - // } - // - // Shift right every byte by 4 bits. - // There is no `_mm512_srli_epi8` intrinsic, so we have to use `_mm512_srli_epi16` - // and combine it with a mask to clear the higher bits. - higher_nibbles_vec.zmm = _mm512_and_si512(_mm512_srli_epi16(text_vec.zmm, 4), _mm512_set1_epi8(0x0f)); - bitset_even_vec.zmm = _mm512_shuffle_epi8(filter_even_vec.zmm, higher_nibbles_vec.zmm); - bitset_odd_vec.zmm = _mm512_shuffle_epi8(filter_odd_vec.zmm, higher_nibbles_vec.zmm); - // - // At this point we can validate the `bitset_even_vec` and `bitset_odd_vec` contents like this: - // - // for (sz_size_t i = 0; i != load_length; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t const *bitset_ptr = &filter->_u8s[0]; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; - // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); - // } - // - // TODO: Is this a good place for ternary logic? - __mmask64 take_first = _mm512_cmplt_epi8_mask(lower_nibbles_vec.zmm, _mm512_set1_epi8(8)); - bitset_even_vec.zmm = _mm512_mask_blend_epi8(take_first, bitset_odd_vec.zmm, bitset_even_vec.zmm); - __mmask64 matches_mask = _mm512_mask_test_epi8_mask(load_mask, bitset_even_vec.zmm, bitmask_vec.zmm); - if (matches_mask) { - int offset = sz_u64_ctz(matches_mask); - return text + offset; - } - else { text += load_length, length -= load_length; } - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); -} - -SZ_PUBLIC sz_cptr_t sz_find_many_avx512( // - sz_cptr_t haystack, sz_size_t haystack_length, // - sz_cptr_t const *needles, sz_size_t const *needles_lengths, // - sz_size_t *needle_offset) { - - // When dealing with huge needles vocabularies, like in tokenization workloads, we need to construct an automaton. - // But in many cases, the vocabulary is small enough to use a simpler DFA-less approach, combining the ideas from - // the `sz_find_avx512` and `sz_find_charset_avx512` functions. - // - // Pick the offsets within needles where there is the least variance in the characters. - // Like for "the", "then", "there", "these", "those", "their", "they", "them", "that", "this", "thus", "than": - // - // 0: 't' - // 1: 'h' - // 2: 'e', 'a', 'i', 'o', 'u' - // 3: 'n', 'r', 's', 'i', 'y', 'm', 't' - // - // So depending on our "register budget", we can use a different number of pivot points: offset 0, 1, 2 make - // the most sense if we can only use 3 ZMM registers. - sz_unused(haystack && haystack_length && needles && needles_lengths && needle_offset); - return 0; -} - -/** - * Computes the Needleman Wunsch alignment score between two strings. 
- * The method uses 32-bit integers to accumulate the running score for every cell in the matrix. - * Assuming the costs of substitutions can be arbitrary signed 8-bit integers, the method is expected to be used - * on strings not exceeding 2^24 length or 16.7 million characters. - * - * Unlike the `_sz_edit_distance_skewed_diagonals_upto65k_avx512` method, this one uses signed integers to store - * the accumulated score. Moreover, it's primary bottleneck is the latency of gathering the substitution costs - * from the substitution matrix. If we use the diagonal order, we will be comparing a slice of the first string with - * a slice of the second. If we stick to the conventional horizontal order, we will be comparing one character against - * a slice, which is much easier to optimize. In that case we are sampling costs not from arbitrary parts of - * a 256 x 256 matrix, but from a single row! - */ -SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { - - // If one of the strings is empty - the edit distance is equal to the length of the other one - if (longer_length == 0) return (sz_ssize_t)shorter_length * gap; - if (shorter_length == 0) return (sz_ssize_t)longer_length * gap; - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - sz_size_t const max_length = 256ull * 256ull * 256ull; - sz_size_t const n = longer_length + 1; - sz_assert(n < max_length && "The length must fit into 24-bit integer. Otherwise use serial variant."); - sz_unused(longer_length && max_length); - - sz_size_t buffer_length = sizeof(sz_i32_t) * n * 2; - sz_i32_t *distances = (sz_i32_t *)alloc->allocate(buffer_length, alloc->handle); - sz_i32_t *previous_distances = distances; - sz_i32_t *current_distances = previous_distances + n; - - // Intialize the first row of the Levenshtein matrix with `iota`. - for (sz_size_t idx_longer = 0; idx_longer != n; ++idx_longer) - previous_distances[idx_longer] = (sz_i32_t)idx_longer * gap; - - /// Contains up to 16 consecutive characters from the longer string. - sz_u512_vec_t longer_vec; - sz_u512_vec_t cost_deletion_vec, cost_substitution_vec, lookup_substitution_vec, current_vec; - sz_u512_vec_t row_first_subs_vec, row_second_subs_vec, row_third_subs_vec, row_fourth_subs_vec; - sz_u512_vec_t shuffled_first_subs_vec, shuffled_second_subs_vec, shuffled_third_subs_vec, shuffled_fourth_subs_vec; - - // Prepare constants and masks. 
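    // For reference, the scalar sampling that the code below vectorizes reads one byte per
    // character from a single 256-byte row of the substitution matrix:
    //
    //      sz_error_cost_t const *row_subs = subs + shorter_unsigned[idx_shorter] * 256u;
    //      sz_error_cost_t cost = row_subs[(sz_u8_t)longer[idx_longer]];
    //
    // The constants prepared next (the 0x80 and 0x40 byte tests) drive the blends that pick
    // the right quarter of that row for every character, replacing the per-character gather.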
- sz_u512_vec_t is_third_or_fourth_vec, is_second_or_fourth_vec, gap_vec; - { - char is_third_or_fourth_check, is_second_or_fourth_check; - *(sz_u8_t *)&is_third_or_fourth_check = 0x80, *(sz_u8_t *)&is_second_or_fourth_check = 0x40; - is_third_or_fourth_vec.zmm = _mm512_set1_epi8(is_third_or_fourth_check); - is_second_or_fourth_vec.zmm = _mm512_set1_epi8(is_second_or_fourth_check); - gap_vec.zmm = _mm512_set1_epi32(gap); - } - - sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter; - for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) { - sz_i32_t last_in_row = current_distances[0] = (sz_i32_t)(idx_shorter + 1) * gap; - - // Load one row of the substitution matrix into four ZMM registers. - sz_error_cost_t const *row_subs = subs + shorter_unsigned[idx_shorter] * 256u; - row_first_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 0); - row_second_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 1); - row_third_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 2); - row_fourth_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 3); - - // In the serial version we have one forward pass, that computes the deletion, - // insertion, and substitution costs at once. - // for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) { - // sz_ssize_t cost_deletion = previous_distances[idx_longer + 1] + gap; - // sz_ssize_t cost_insertion = current_distances[idx_longer] + gap; - // sz_ssize_t cost_substitution = previous_distances[idx_longer] + row_subs[longer_unsigned[idx_longer]]; - // current_distances[idx_longer + 1] = sz_min_of_three(cost_deletion, cost_insertion, cost_substitution); - // } - // - // Given the complexity of handling the data-dependency between consecutive insertion cost computations - // within a Levenshtein matrix, the simplest design would be to vectorize every kind of cost computation - // separately. - // 1. Compute substitution costs for up to 64 characters at once, upcasting from 8-bit integers to 32. - // 2. Compute the pairwise minimum with deletion costs. - // 3. Inclusive prefix minimum computation to combine with addition costs. - // Proceeding with substitutions: - for (sz_size_t idx_longer = 0; idx_longer < longer_length; idx_longer += 64) { - sz_size_t register_length = sz_min_of_two(longer_length - idx_longer, 64); - __mmask64 mask = _sz_u64_mask_until(register_length); - longer_vec.zmm = _mm512_maskz_loadu_epi8(mask, longer + idx_longer); - - // Blend the `row_(first|second|third|fourth)_subs_vec` into `current_vec`, picking the right source - // for every character in `longer_vec`. Before that, we need to permute the subsititution vectors. - // Only the bottom 6 bits of a byte are used in VPERB, so we don't even need to mask. - shuffled_first_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_first_subs_vec.zmm); - shuffled_second_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_second_subs_vec.zmm); - shuffled_third_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_third_subs_vec.zmm); - shuffled_fourth_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_fourth_subs_vec.zmm); - - // To blend we can invoke three `_mm512_cmplt_epu8_mask`, but we can also achieve the same using - // the AND logical operation, checking the top two bits of every byte. - // Continuing this thought, we can use the VPTESTMB instruction to output the mask after the AND. 
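    // For illustration, a concrete instance of the two-bit selection described above: for
    // character code 200 (0xC8) both the 0x80 and 0x40 bits are set, so the blends pick
    // `shuffled_fourth_subs_vec` (row bytes 192..255), and VPERMB's 6-bit index is
    // 0xC8 & 0x3F = 8, leaving the lane with `row_subs[192 + 8] == row_subs[200]`, the same
    // cost the scalar code would have gathered.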
- __mmask64 is_third_or_fourth = _mm512_mask_test_epi8_mask(mask, longer_vec.zmm, is_third_or_fourth_vec.zmm); - __mmask64 is_second_or_fourth = - _mm512_mask_test_epi8_mask(mask, longer_vec.zmm, is_second_or_fourth_vec.zmm); - lookup_substitution_vec.zmm = _mm512_mask_blend_epi8( - is_third_or_fourth, - // Choose between the first and the second. - _mm512_mask_blend_epi8(is_second_or_fourth, shuffled_first_subs_vec.zmm, shuffled_second_subs_vec.zmm), - // Choose between the third and the fourth. - _mm512_mask_blend_epi8(is_second_or_fourth, shuffled_third_subs_vec.zmm, shuffled_fourth_subs_vec.zmm)); - - // First, sign-extend lower and upper 16 bytes to 16-bit integers. - __m512i current_0_31_vec = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(lookup_substitution_vec.zmm, 0)); - __m512i current_32_63_vec = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(lookup_substitution_vec.zmm, 1)); - - // Now extend those 16-bit integers to 32-bit. - // This isn't free, same as the subsequent store, so we only want to do that for the populated lanes. - // To minimize the number of loads and stores, we can combine our substitution costs with the previous - // distances, containing the deletion costs. - { - cost_substitution_vec.zmm = _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer); - cost_substitution_vec.zmm = _mm512_add_epi32( - cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_0_31_vec, 0))); - cost_deletion_vec.zmm = _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer); - cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm); - current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm); - - // Inclusive prefix minimum computation to combine with insertion costs. - // Simply disabling this operation results in 5x performance improvement, meaning - // that this operation is responsible for 80% of the total runtime. - // for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) { - // current_distances[idx_longer + 1] = - // sz_max_of_two(current_distances[idx_longer] + gap, current_distances[idx_longer + 1]); - // } - // - // To perform the same operation in vectorized form, we need to perform a tree-like reduction, - // that will involve multiple steps. It's quite expensive and should be first tested in the - // "experimental" section. - // - // Another approach might be loop unrolling: - // current_vec.i32s[0] = last_in_row = sz_i32_max_of_two(current_vec.i32s[0], last_in_row + gap); - // current_vec.i32s[1] = last_in_row = sz_i32_max_of_two(current_vec.i32s[1], last_in_row + gap); - // current_vec.i32s[2] = last_in_row = sz_i32_max_of_two(current_vec.i32s[2], last_in_row + gap); - // ... yet this approach is also quite expensive. - for (int i = 0; i != 16; ++i) - current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap); - _mm512_mask_storeu_epi32(current_distances + idx_longer + 1, (__mmask16)mask, current_vec.zmm); - } - - // Export the values from 16 to 31. 
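    // Note: each 64-character chunk is exported in four groups of 16 lanes (0-15, 16-31,
    // 32-47, 48-63), since after widening the byte costs to 32-bit scores only 16 of them
    // fit into one ZMM register; `_kshiftri_mask64(mask, 16)` advances the store mask
    // before each subsequent group.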
- if (register_length > 16) { - mask = _kshiftri_mask64(mask, 16); - cost_substitution_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 16); - cost_substitution_vec.zmm = _mm512_add_epi32( - cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_0_31_vec, 1))); - cost_deletion_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 16); - cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm); - current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm); - - // Aggregate running insertion costs within the register. - for (int i = 0; i != 16; ++i) - current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap); - _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 16, (__mmask16)mask, current_vec.zmm); - } - - // Export the values from 32 to 47. - if (register_length > 32) { - mask = _kshiftri_mask64(mask, 16); - cost_substitution_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 32); - cost_substitution_vec.zmm = _mm512_add_epi32( - cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_32_63_vec, 0))); - cost_deletion_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 32); - cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm); - current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm); - - // Aggregate running insertion costs within the register. - for (int i = 0; i != 16; ++i) - current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap); - _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 32, (__mmask16)mask, current_vec.zmm); - } - - // Export the values from 32 to 47. - if (register_length > 48) { - mask = _kshiftri_mask64(mask, 16); - cost_substitution_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 48); - cost_substitution_vec.zmm = _mm512_add_epi32( - cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_32_63_vec, 1))); - cost_deletion_vec.zmm = - _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 48); - cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm); - current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm); - - // Aggregate running insertion costs within the register. - for (int i = 0; i != 16; ++i) - current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap); - _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 48, (__mmask16)mask, current_vec.zmm); - } - } - - // Swap previous_distances and current_distances pointers - sz_pointer_swap((void **)&previous_distances, (void **)¤t_distances); - } - - // Cache scalar before `free` call. 
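    // Note: after the final pointer swap above, `previous_distances` refers to the last row
    // that was computed, so the score is simply its last element, read out below before the
    // temporary buffer is released.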
- sz_ssize_t result = previous_distances[longer_length]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -SZ_INTERNAL sz_ssize_t sz_alignment_score_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { - - if (sz_max_of_two(shorter_length, longer_length) < (256ull * 256ull * 256ull)) - return _sz_alignment_score_wagner_fisher_upto17m_avx512(shorter, shorter_length, longer, longer_length, subs, - gap, alloc); - else - return sz_alignment_score_serial(shorter, shorter_length, longer, longer_length, subs, gap, alloc); -} - -enum sz_encoding_t { - sz_encoding_unknown_k = 0, - sz_encoding_ascii_k = 1, - sz_encoding_utf8_k = 2, - sz_encoding_utf16_k = 3, - sz_encoding_utf32_k = 4, - sz_jwt_k, - sz_base64_k, - // Low priority encodings: - sz_encoding_utf8bom_k = 5, - sz_encoding_utf16le_k = 6, - sz_encoding_utf16be_k = 7, - sz_encoding_utf32le_k = 8, - sz_encoding_utf32be_k = 9, -}; - -// Character Set Detection is one of the most commonly performed operations in data processing with -// [Chardet](https://github.com/chardet/chardet), [Charset Normalizer](https://github.com/jawah/charset_normalizer), -// [cChardet](https://github.com/PyYoshi/cChardet) being the most commonly used options in the Python ecosystem. -// All of them are notoriously slow. -// -// Moreover, as of October 2024, UTF-8 is the dominant character encoding on the web, used by 98.4% of websites. -// Other have minimal usage, according to [W3Techs](https://w3techs.com/technologies/overview/character_encoding): -// - ISO-8859-1: 1.2% -// - Windows-1252: 0.3% -// - Windows-1251: 0.2% -// - EUC-JP: 0.1% -// - Shift JIS: 0.1% -// - EUC-KR: 0.1% -// - GB2312: 0.1% -// - Windows-1250: 0.1% -// Within programming language implementations and database management systems, 16-bit and 32-bit fixed-width encodings -// are also very popular and we need a way to efficienly differentiate between the most common UTF flavors, ASCII, and -// the rest. -// -// One good solution is the [simdutf](https://github.com/simdutf/simdutf) library, but it depends on the C++ runtime -// and focuses more on incremental validation & transcoding, rather than detection. -// -// So we need a very fast and efficient way of determining -SZ_PUBLIC sz_bool_t sz_detect_encoding(sz_cptr_t text, sz_size_t length) { - // https://github.com/simdutf/simdutf/blob/master/src/icelake/icelake_utf8_validation.inl.cpp - // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_from_utf8.inl.cpp#L81 - // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L661 - // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L788 - - // We can implement this operation simpler & differently, assuming most of the time continuous chunks of memory - // have identical encoding. With Russian and many European languages, we generally deal with 2-byte codepoints - // with occasional 1-byte punctuation marks. In the case of Chinese, Japanese, and Korean, we deal with 3-byte - // codepoints. In the case of emojis, we deal with 4-byte codepoints. - // We can also use the idea, that misaligned reads are quite cheap on modern CPUs. 
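    // For illustration, a minimal scalar sketch of the standard UTF-8 lead-byte
    // classification such a detector could build on (generic UTF-8 framing, not an existing
    // StringZilla routine):
    //
    //      sz_u8_t b = *(sz_u8_t const *)text;
    //      if (b < 0x80) { /* 1-byte ASCII code point */ }
    //      else if ((b & 0xE0) == 0xC0) { /* lead byte of a 2-byte sequence */ }
    //      else if ((b & 0xF0) == 0xE0) { /* lead byte of a 3-byte sequence */ }
    //      else if ((b & 0xF8) == 0xF0) { /* lead byte of a 4-byte sequence */ }
    //      else { /* 0x80..0xBF continuation byte, or an invalid lead */ }
    //
    // Counting how often each class appears in a sample can help separate ASCII, UTF-8, and
    // the fixed-width UTF-16/UTF-32 flavors mentioned above.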
- int can_be_ascii = 1, can_be_utf8 = 1, can_be_utf16 = 1, can_be_utf32 = 1; - sz_unused(can_be_ascii + can_be_utf8 + can_be_utf16 + can_be_utf32); - sz_unused(text && length); - return sz_false_k; -} - -#pragma clang attribute pop -#pragma GCC pop_options -#endif - -#pragma endregion - -/* @brief Implementation of the string search algorithms using the Arm NEON instruction set, available on 64-bit - * Arm processors. Implements: {substring search, character search, character set search} x {forward, reverse}. - */ -#pragma region ARM NEON - -#if SZ_USE_ARM_NEON -#pragma GCC push_options -#pragma GCC target("arch=armv8.2-a+simd") -#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) - -/** - * @brief Helper structure to simplify work with 64-bit words. - */ -typedef union sz_u128_vec_t { - uint8x16_t u8x16; - uint16x8_t u16x8; - uint32x4_t u32x4; - uint64x2_t u64x2; - sz_u64_t u64s[2]; - sz_u32_t u32s[4]; - sz_u16_t u16s[8]; - sz_u8_t u8s[16]; -} sz_u128_vec_t; - -SZ_INTERNAL sz_u64_t _sz_vreinterpretq_u8_u4(uint8x16_t vec) { - // Use `vshrn` to produce a bitmask, similar to `movemask` in SSE. - // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon - return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(vec), 4)), 0) & 0x8888888888888888ull; -} - -SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: - //! https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations - return sz_order_serial(a, a_length, b, b_length); -} - -SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_u128_vec_t a_vec, b_vec; - for (; length >= 16; a += 16, b += 16, length -= 16) { - a_vec.u8x16 = vld1q_u8((sz_u8_t const *)a); - b_vec.u8x16 = vld1q_u8((sz_u8_t const *)b); - uint8x16_t cmp = vceqq_u8(a_vec.u8x16, b_vec.u8x16); - if (vminvq_u8(cmp) != 255) { return sz_false_k; } // Check if all bytes match - } - - // Handle remaining bytes - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; -} - -SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) { - uint64x2_t sum_vec = vdupq_n_u64(0); - - // Process 16 bytes (128 bits) at a time - for (; length >= 16; text += 16, length -= 16) { - uint8x16_t vec = vld1q_u8((sz_u8_t const *)text); // Load 16 bytes - uint16x8_t pairwise_sum1 = vpaddlq_u8(vec); // Pairwise add lower and upper 8 bits - uint32x4_t pairwise_sum2 = vpaddlq_u16(pairwise_sum1); // Pairwise add 16-bit results - uint64x2_t pairwise_sum3 = vpaddlq_u32(pairwise_sum2); // Pairwise add 32-bit results - sum_vec = vaddq_u64(sum_vec, pairwise_sum3); // Accumulate the sum - } - - // Final reduction of `sum_vec` to a single scalar - sz_u64_t sum = vgetq_lane_u64(sum_vec, 0) + vgetq_lane_u64(sum_vec, 1); - if (length) sum += sz_checksum_serial(text, length); - return sum; -} - -SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // In most cases the `source` and the `target` are not aligned, but we should - // at least make sure that writes don't touch many cache lines. - // NEON has an instruction to load and write 64 bytes at once. - // - // sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. 
- // sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - // for (; head_length; target += 1, source += 1, head_length -= 1) *target = *source; - // length -= head_length; - // for (; length >= 64; target += 64, source += 64, length -= 64) - // vst4q_u8((sz_u8_t *)target, vld1q_u8_x4((sz_u8_t const *)source)); - // for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = *source; - // - // Sadly, those instructions end up being 20% slower than the code processing 16 bytes at a time: - for (; length >= 16; target += 16, source += 16, length -= 16) - vst1q_u8((sz_u8_t *)target, vld1q_u8((sz_u8_t const *)source)); - if (length) sz_copy_serial(target, source, length); -} - -SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // When moving small buffers, using a small buffer on stack as a temporary storage is faster. - - if (target < source || target >= source + length) { - // Non-overlapping, proceed forward - sz_copy_neon(target, source, length); +SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator) { + sz_size_t space_needed = length + 1; // space for trailing \0 + sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); + // Initialize the string to zeros for safety. + string->words[1] = 0; + string->words[2] = 0; + string->words[3] = 0; + // If we are lucky, no memory allocations will be needed. + if (space_needed <= _SZ_STRING_INTERNAL_SPACE) { + string->internal.start = &string->internal.chars[0]; + string->internal.length = (sz_u8_t)length; } else { - // Overlapping, proceed backward - target += length; - source += length; - - sz_u128_vec_t src_vec; - while (length >= 16) { - target -= 16, source -= 16, length -= 16; - src_vec.u8x16 = vld1q_u8((sz_u8_t const *)source); - vst1q_u8((sz_u8_t *)target, src_vec.u8x16); - } - while (length) { - target -= 1, source -= 1, length -= 1; - *target = *source; - } - } -} - -SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - uint8x16_t fill_vec = vdupq_n_u8(value); // Broadcast the value across the register - - while (length >= 16) { - vst1q_u8((sz_u8_t *)target, fill_vec); - target += 16; - length -= 16; + // If we are not lucky, we need to allocate memory. + string->external.start = (sz_ptr_t)allocator->allocate(space_needed, allocator->handle); + if (!string->external.start) return SZ_NULL_CHAR; + string->external.length = length; + string->external.space = space_needed; } - - // Handle remaining bytes - if (length) sz_fill_serial(target, length, value); + sz_assert(&string->internal.start == &string->external.start && "Alignment confusion"); + string->external.start[length] = 0; + return string->external.start; } -SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } +SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator) { - sz_size_t head_length = (16 - ((sz_size_t)target % 16)) % 16; // 15 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 16; // 15 or less. 
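    // Note: the head/tail arithmetic above aligns the main loop's stores. For example, if
    // `target` ends in ...0x09, then head_length = (16 - 9) % 16 = 7, so after seven serial
    // bytes the destination is 16-byte aligned; the last `(target + length) % 16` bytes are
    // left for the serial tail.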
+ sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - // We need to pull the lookup table into 16x NEON registers. We have a total of 32 such registers. - // According to the Neoverse V2 manual, the 4-table lookup has a latency of 6 cycles, and 4x throughput. - uint8x16x4_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec; - lut_0_to_63_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 0)); - lut_64_to_127_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 64)); - lut_128_to_191_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 128)); - lut_192_to_255_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 192)); + sz_size_t new_space = new_capacity + 1; + if (new_space <= _SZ_STRING_INTERNAL_SPACE) return string->external.start; - sz_u128_vec_t source_vec; - // If the top bit is set in each word of `source_vec`, than we use `lookup_128_to_191_vec` or - // `lookup_192_to_255_vec`. If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`. - sz_u128_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec; - sz_u128_vec_t blended_0_to_255_vec; + sz_ptr_t string_start; + sz_size_t string_length; + sz_size_t string_space; + sz_bool_t string_is_external; + sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); + sz_assert(new_space > string_space && "New space must be larger than current."); - // Process the head with serial code - for (; head_length; target += 1, source += 1, head_length -= 1) *target = lut[*(sz_u8_t const *)source]; + sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); + if (!new_start) return SZ_NULL_CHAR; - // Table lookups on Arm are much simpler to use than on x86, as we can use the `vqtbl4q_u8` instruction - // to perform a 4-table lookup in a single instruction. The XORs are used to adjust the lookup position - // within each 64-byte range of the table. - // Details on the 4-table lookup: https://lemire.me/blog/2019/07/23/arbitrary-byte-to-byte-maps-using-arm-neon/ - length -= head_length; - length -= tail_length; - for (; length >= 16; source += 16, target += 16, length -= 16) { - source_vec.u8x16 = vld1q_u8((sz_u8_t const *)source); - lookup_0_to_63_vec.u8x16 = vqtbl4q_u8(lut_0_to_63_vec, source_vec.u8x16); - lookup_64_to_127_vec.u8x16 = vqtbl4q_u8(lut_64_to_127_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x40))); - lookup_128_to_191_vec.u8x16 = vqtbl4q_u8(lut_128_to_191_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x80))); - lookup_192_to_255_vec.u8x16 = vqtbl4q_u8(lut_192_to_255_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0xc0))); - blended_0_to_255_vec.u8x16 = vorrq_u8(vorrq_u8(lookup_0_to_63_vec.u8x16, lookup_64_to_127_vec.u8x16), - vorrq_u8(lookup_128_to_191_vec.u8x16, lookup_192_to_255_vec.u8x16)); - vst1q_u8((sz_u8_t *)target, blended_0_to_255_vec.u8x16); - } + sz_copy(new_start, string_start, string_length); + string->external.start = new_start; + string->external.space = new_space; + string->external.padding = 0; + string->external.length = string_length; - // Process the tail with serial code - for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = lut[*(sz_u8_t const *)source]; + // Deallocate the old string. 
+ if (string_is_external) allocator->free(string_start, string_space, allocator->handle); + return string->external.start; } -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - sz_u64_t matches; - sz_u128_vec_t h_vec, n_vec, matches_vec; - n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n); +SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator) { - while (h_length >= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h); - matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16); - // In Arm NEON we don't have a `movemask` to combine it with `ctz` and get the offset of the match. - // But assuming the `vmaxvq` is cheap, we can use it to find the first match, by blending (bitwise selecting) - // the vector with a relative offsets array. - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; + sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - h += 16, h_length -= 16; - } + sz_ptr_t string_start; + sz_size_t string_length; + sz_size_t string_space; + sz_bool_t string_is_external; + sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - return sz_find_byte_serial(h, h_length, n); -} + // We may already be space-optimal, and in that case we don't need to do anything. + sz_size_t new_space = string_length + 1; + if (string_space == new_space || !string_is_external) return string->external.start; -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - sz_u64_t matches; - sz_u128_vec_t h_vec, n_vec, matches_vec; - n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n); + sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); + if (!new_start) return SZ_NULL_CHAR; - while (h_length >= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h + h_length - 16); - matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; - h_length -= 16; - } + sz_copy(new_start, string_start, string_length); + string->external.start = new_start; + string->external.space = new_space; + string->external.padding = 0; + string->external.length = string_length; - return sz_rfind_byte_serial(h, h_length, n); + // Deallocate the old string. + if (string_is_external) allocator->free(string_start, string_space, allocator->handle); + return string->external.start; } -SZ_PUBLIC sz_u64_t _sz_find_charset_neon_register(sz_u128_vec_t h_vec, uint8x16_t set_top_vec_u8x16, - uint8x16_t set_bottom_vec_u8x16) { +SZ_PUBLIC sz_ptr_t sz_string_expand( // + sz_string_t *string, sz_size_t offset, sz_size_t added_length, sz_memory_allocator_t *allocator) { - // Once we've read the characters in the haystack, we want to - // compare them against our bitset. The serial version of that code - // would look like: `(set_->_u8s[c >> 3] & (1u << (c & 7u))) != 0`. - uint8x16_t byte_index_vec = vshrq_n_u8(h_vec.u8x16, 3); - uint8x16_t byte_mask_vec = vshlq_u8(vdupq_n_u8(1), vreinterpretq_s8_u8(vandq_u8(h_vec.u8x16, vdupq_n_u8(7)))); - uint8x16_t matches_top_vec = vqtbl1q_u8(set_top_vec_u8x16, byte_index_vec); - // The table lookup instruction in NEON replies to out-of-bound requests with zeros. - // The values in `byte_index_vec` all fall in [0; 32). So for values under 16, substracting 16 will underflow - // and map into interval [240, 256). 
Meaning that those will be populated with zeros and we can safely - // merge `matches_top_vec` and `matches_bottom_vec` with a bitwise OR. - uint8x16_t matches_bottom_vec = vqtbl1q_u8(set_bottom_vec_u8x16, vsubq_u8(byte_index_vec, vdupq_n_u8(16))); - uint8x16_t matches_vec = vorrq_u8(matches_top_vec, matches_bottom_vec); - // Istead of pure `vandq_u8`, we can immediately broadcast a match presence across each 8-bit word. - matches_vec = vtstq_u8(matches_vec, byte_mask_vec); - return _sz_vreinterpretq_u8_u4(matches_vec); -} + sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); -SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { + sz_ptr_t string_start; + sz_size_t string_length; + sz_size_t string_space; + sz_bool_t string_is_external; + sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_neon(h, h_length, n); + // The user intended to extend the string. + offset = sz_min_of_two(offset, string_length); - // Scan through the string. - // Assuming how tiny the Arm NEON registers are, we should avoid internal branches at all costs. - // That's why, for smaller needles, we use different loops. - if (n_length == 2) { - // Broadcast needle characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_last_vec, n_first_vec, n_last_vec, matches_vec; - // Dealing with 16-bit values, we can load 2 registers at a time and compare 31 possible offsets - // in a single loop iteration. - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]); - for (; h_length >= 17; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1)); - matches_vec.u8x16 = - vandq_u8(vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - } - else if (n_length == 3) { - // Broadcast needle characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - // Comparing 24-bit values is a bumer. Being lazy, I went with the same approach - // as when searching for string over 4 characters long. I only avoid the last comparison. - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[2]); - for (; h_length >= 18; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 2)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } + // If we are lucky, no memory allocations will be needed. 
+ if (string_length + added_length < string_space) { + sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); + string_start[string_length + added_length] = 0; + // Even if the string is on the stack, the `+=` won't affect the tail of the string. + string->external.length += added_length; } + // If we are not lucky, we need to allocate more memory. else { - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - // Broadcast those characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]); - // Walk through the string. - for (; h_length >= n_length + 16; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_first)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_mid)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_last)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - while (matches) { - int potential_offset = sz_u64_ctz(matches) / 4; - if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - } - - return sz_find_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_neon(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Will contain 4 bits per character. 
- sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]); - - sz_cptr_t h_reversed; - for (; h_length >= n_length + 16; h_length -= 16) { - h_reversed = h + h_length - n_length - 16 + 1; - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_first)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_mid)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_last)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - while (matches) { - int potential_offset = sz_u64_clz(matches) / 4; - if (sz_equal(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & (1ull << (63 - potential_offset * 4))) != 0 && - "The bit must be set before we squash it"); - matches &= ~(1ull << (63 - potential_offset * 4)); - } - } - - return sz_rfind_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { - sz_u64_t matches; - sz_u128_vec_t h_vec; - uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); - uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); - - for (; h_length >= 16; h += 16, h_length -= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h)); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - - return sz_find_charset_serial(h, h_length, set); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { - sz_u64_t matches; - sz_u128_vec_t h_vec; - uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); - uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); + sz_size_t next_planned_size = sz_max_of_two(SZ_CACHE_LINE_WIDTH, string_space * 2ull); + sz_size_t min_needed_space = sz_size_bit_ceil(offset + string_length + added_length + 1); + sz_size_t new_space = sz_max_of_two(min_needed_space, next_planned_size); + string_start = sz_string_reserve(string, new_space - 1, allocator); + if (!string_start) return SZ_NULL_CHAR; - // Check `sz_find_charset_neon` for explanations. - for (; h_length >= 16; h_length -= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h) + h_length - 16); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); - if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; + // Copy into the new buffer. + sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); + string_start[string_length + added_length] = 0; + string->external.length = string_length + added_length; } - return sz_rfind_charset_serial(h, h_length, set); + return string_start; } -#pragma clang attribute pop -#pragma GCC pop_options -#endif // Arm Neon - -#pragma endregion - -/* @brief Implementation of the string search algorithms using the Arm SVE variable-length registers, available - * in Arm v9 processors. 
- * - * Implements: - * - memory: {copy, move, fill} - * - comparisons: {equal, order} - * - search: {substring, character, character set} x {forward, reverse}. - */ -#pragma region ARM SVE - -#if SZ_USE_ARM_SVE -#pragma GCC push_options -#pragma GCC target("arch=armv8.2-a+sve") -#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) - -SZ_PUBLIC void sz_fill_sve(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - svuint8_t value_vec = svdup_u8(value); - sz_size_t vec_len = svcntb(); // Vector length in bytes (scalable) - - if (length <= vec_len) { - // Small buffer case: use mask to handle small writes - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)length); - svst1_u8(mask, (unsigned char *)target, value_vec); - } - else { - // Calculate head, body, and tail sizes - sz_size_t head_length = vec_len - ((sz_size_t)target % vec_len); - sz_size_t tail_length = (sz_size_t)(target + length) % vec_len; - sz_size_t body_length = length - head_length - tail_length; +SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length) { - // Handle unaligned head - svbool_t head_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)head_length); - svst1_u8(head_mask, (unsigned char *)target, value_vec); - target += head_length; + sz_assert(string && "String can't be SZ_NULL."); - // Aligned body loop - for (; body_length >= vec_len; target += vec_len, body_length -= vec_len) { - svst1_u8(svptrue_b8(), (unsigned char *)target, value_vec); - } + sz_ptr_t string_start; + sz_size_t string_length; + sz_size_t string_space; + sz_bool_t string_is_external; + sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - // Handle unaligned tail - svbool_t tail_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)tail_length); - svst1_u8(tail_mask, (unsigned char *)target, value_vec); - } -} + // Normalize the offset, it can't be larger than the length. + offset = sz_min_of_two(offset, string_length); -SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - sz_size_t vec_len = svcntb(); // Vector length in bytes + // We shouldn't normalize the length, to avoid overflowing on `offset + length >= string_length`, + // if receiving `length == SZ_SIZE_MAX`. After following expression the `length` will contain + // exactly the delta between original and final length of this `string`. + length = sz_min_of_two(length, string_length - offset); - // Arm Neoverse V2 cores in Graviton 4, for example, come with 256 KB of L1 data cache per core, - // and 8 MB of L2 cache per core. Moreover, the L1 cache is fully associative. - // With two strings, we may consider the overal workload huge, if each exceeds 1 MB in length. - // - // int is_huge = length >= 4ull * 1024ull * 1024ull; - // - // When the buffer is small, there isn't much to innovate. - if (length <= vec_len) { - // Small buffer case: use mask to handle small writes - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - } - // When dealing with larger buffers, similar to AVX-512, we want minimize unaligned operations - // and handle the head, body, and tail separately. We can also traverse the buffer in both directions - // as Arm generally supports more simultaneous stores than x86 CPUs. - // - // For gigantic datasets, similar to AVX-512, non-temporal "loads" and "stores" can be used. 
- // Sadly, if the register size (16 byte or larger) is smaller than a cache-line (64 bytes) - // we will pay a huge penalty on loads, fetching the same content many times. - // It may be better to allow caching (and subsequent eviction), in favor of using four-element - // tuples, wich will be guaranteed to be a multiple of a cache line. + // There are 2 common cases, that wouldn't even require a `memmove`: + // 1. Erasing the entire contents of the string. + // In that case `length` argument will be equal or greater than `length` member. + // 2. Removing the tail of the string with something like `string.pop_back()` in C++. // - // Another approach is to use the `LD4B` instructions, which will populate four registers at once. - // This however, further decreases the performance from LibC-like 29 GB/s to 20 GB/s. - else { - // Calculating head, body, and tail sizes depends on the `vec_len`, - // but it's runtime constant, and the modulo operation is expensive! - // Instead we use the fact, that it's always a multiple of 128 bits or 16 bytes. - sz_size_t head_length = 16 - ((sz_size_t)target % 16); - sz_size_t tail_length = (sz_size_t)(target + length) % 16; - sz_size_t body_length = length - head_length - tail_length; - - // Handle unaligned parts - svbool_t head_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)head_length); - svuint8_t head_data = svld1_u8(head_mask, (unsigned char *)source); - svst1_u8(head_mask, (unsigned char *)target, head_data); - svbool_t tail_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)tail_length); - svuint8_t tail_data = svld1_u8(tail_mask, (unsigned char *)source + head_length + body_length); - svst1_u8(tail_mask, (unsigned char *)target + head_length + body_length, tail_data); - target += head_length; - source += head_length; - - // Aligned body loop, walking in two directions - for (; body_length >= vec_len * 2; target += vec_len, source += vec_len, body_length -= vec_len * 2) { - svuint8_t forward_data = svld1_u8(svptrue_b8(), (unsigned char *)source); - svuint8_t backward_data = svld1_u8(svptrue_b8(), (unsigned char *)source + body_length - vec_len); - svst1_u8(svptrue_b8(), (unsigned char *)target, forward_data); - svst1_u8(svptrue_b8(), (unsigned char *)target + body_length - vec_len, backward_data); - } - // Up to (vec_len * 2 - 1) bytes of data may be left in the body, - // so we can unroll the last two optional loop iterations. - if (body_length > vec_len) { - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)body_length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - body_length -= vec_len; - source += body_length; - target += body_length; - } - if (body_length) { - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)body_length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - } - } -} - -#pragma clang attribute pop -#pragma GCC pop_options -#endif // Arm SVE - -#pragma endregion - -/* - * @brief Pick the right implementation for the string search algorithms. 
- */ -#pragma region Compile Time Dispatching - -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t ins, sz_size_t length) { return sz_hash_serial(ins, length); } -SZ_PUBLIC void sz_tolower(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_tolower_serial(ins, length, outs); } -SZ_PUBLIC void sz_toupper(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toupper_serial(ins, length, outs); } -SZ_PUBLIC void sz_toascii(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toascii_serial(ins, length, outs); } -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t ins, sz_size_t length) { return sz_isascii_serial(ins, length); } - -SZ_PUBLIC void sz_hashes_fingerprint(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_ptr_t fingerprint, - sz_size_t fingerprint_bytes) { - - sz_bool_t fingerprint_length_is_power_of_two = (sz_bool_t)((fingerprint_bytes & (fingerprint_bytes - 1)) == 0); - sz_string_view_t fingerprint_buffer = {fingerprint, fingerprint_bytes}; - - // There are several issues related to the fingerprinting algorithm. - // First, the memory traversal order is important. - // https://blog.stuffedcow.net/2015/08/pagewalk-coherence/ - - // In most cases the fingerprint length will be a power of two. - if (fingerprint_length_is_power_of_two == sz_false_k) - sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_non_pow2_callback, &fingerprint_buffer); - else - sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_pow2_callback, &fingerprint_buffer); -} - -#if !SZ_DYNAMIC_DISPATCH - -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { -#if SZ_USE_X86_AVX512 - return sz_checksum_avx512(text, length); -#elif SZ_USE_X86_AVX2 - return sz_checksum_avx2(text, length); -#elif SZ_USE_ARM_NEON - return sz_checksum_neon(text, length); -#else - return sz_checksum_serial(text, length); -#endif -} - -SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { -#if SZ_USE_X86_AVX512 - return sz_equal_avx512(a, b, length); -#elif SZ_USE_X86_AVX2 - return sz_equal_avx2(a, b, length); -#elif SZ_USE_ARM_NEON - return sz_equal_neon(a, b, length); -#else - return sz_equal_serial(a, b, length); -#endif -} - -SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { -#if SZ_USE_X86_AVX512 - return sz_order_avx512(a, a_length, b, b_length); -#elif SZ_USE_X86_AVX2 - return sz_order_avx2(a, a_length, b, b_length); -#elif SZ_USE_ARM_NEON - return sz_order_neon(a, a_length, b, b_length); -#else - return sz_order_serial(a, a_length, b, b_length); -#endif -} - -SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_X86_AVX512 - sz_copy_avx512(target, source, length); -#elif SZ_USE_X86_AVX2 - sz_copy_avx2(target, source, length); -#elif SZ_USE_ARM_NEON - sz_copy_neon(target, source, length); -#else - sz_copy_serial(target, source, length); -#endif -} - -SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_X86_AVX512 - sz_move_avx512(target, source, length); -#elif SZ_USE_X86_AVX2 - sz_move_avx2(target, source, length); -#elif SZ_USE_ARM_NEON - sz_move_neon(target, source, length); -#else - sz_move_serial(target, source, length); -#endif -} - -SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) { -#if SZ_USE_X86_AVX512 - sz_fill_avx512(target, length, value); -#elif SZ_USE_X86_AVX2 - sz_fill_avx2(target, length, value); -#elif SZ_USE_ARM_NEON - sz_fill_neon(target, length, value); -#else - sz_fill_serial(target, length, value); -#endif -} - 
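// When dynamic dispatch is disabled, these thin wrappers pick a backend with the
// preprocessor, so the caller never names one. A minimal hypothetical usage sketch,
// assuming any of the listed backends was compiled in:
//
//      char buffer[32];
//      sz_fill(buffer, sizeof(buffer), 0);  // resolves to sz_fill_avx512, sz_fill_neon,
//      sz_copy(buffer, "stringzilla", 11);  // ... or the serial fallbacks
//
// With `SZ_DYNAMIC_DISPATCH` enabled, the same symbols are expected to be provided
// elsewhere and resolved at runtime instead of by the preprocessor.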
-SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { -#if SZ_USE_X86_AVX512 - sz_look_up_transform_avx512(source, length, lut, target); -#elif SZ_USE_X86_AVX2 - sz_look_up_transform_avx2(source, length, lut, target); -#elif SZ_USE_ARM_NEON - sz_look_up_transform_neon(source, length, lut, target); -#else - sz_look_up_transform_serial(source, length, lut, target); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_X86_AVX512 - return sz_find_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_X86_AVX2 - return sz_find_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_ARM_NEON - return sz_find_byte_neon(haystack, h_length, needle); -#else - return sz_find_byte_serial(haystack, h_length, needle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_X86_AVX512 - return sz_rfind_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_X86_AVX2 - return sz_rfind_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_ARM_NEON - return sz_rfind_byte_neon(haystack, h_length, needle); -#else - return sz_rfind_byte_serial(haystack, h_length, needle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_X86_AVX512 - return sz_find_avx512(haystack, h_length, needle, n_length); -#elif SZ_USE_X86_AVX2 - return sz_find_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_ARM_NEON - return sz_find_neon(haystack, h_length, needle, n_length); -#else - return sz_find_serial(haystack, h_length, needle, n_length); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_X86_AVX512 - return sz_rfind_avx512(haystack, h_length, needle, n_length); -#elif SZ_USE_X86_AVX2 - return sz_rfind_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_ARM_NEON - return sz_rfind_neon(haystack, h_length, needle, n_length); -#else - return sz_rfind_serial(haystack, h_length, needle, n_length); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_X86_AVX512 - return sz_find_charset_avx512(text, length, set); -#elif SZ_USE_X86_AVX2 - return sz_find_charset_avx2(text, length, set); -#elif SZ_USE_ARM_NEON - return sz_find_charset_neon(text, length, set); -#else - return sz_find_charset_serial(text, length, set); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_X86_AVX512 - return sz_rfind_charset_avx512(text, length, set); -#elif SZ_USE_X86_AVX2 - return sz_rfind_charset_avx2(text, length, set); -#elif SZ_USE_ARM_NEON - return sz_rfind_charset_neon(text, length, set); -#else - return sz_rfind_charset_serial(text, length, set); -#endif -} - -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_serial(a, a_length, b, b_length, bound); -} - -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_utf8_serial(a, a_length, b, b_length, bound); -} - -SZ_DYNAMIC sz_size_t sz_edit_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, 
sz_memory_allocator_t *alloc) { -#if SZ_USE_X86_AVX512 - return sz_edit_distance_avx512(a, a_length, b, b_length, bound, alloc); -#else - return sz_edit_distance_serial(a, a_length, b, b_length, bound, alloc); -#endif -} - -SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - return _sz_edit_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc); -} - -SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_error_cost_t const *subs, sz_error_cost_t gap, - sz_memory_allocator_t *alloc) { -#if SZ_USE_X86_AVX512 - return sz_alignment_score_avx512(a, a_length, b, b_length, subs, gap, alloc); -#else - return sz_alignment_score_serial(a, a_length, b, b_length, subs, gap, alloc); -#endif -} - -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle) { -#if SZ_USE_X86_AVX512 - sz_hashes_avx512(text, length, window_length, window_step, callback, callback_handle); -#elif SZ_USE_X86_AVX2 - sz_hashes_avx2(text, length, window_length, window_step, callback, callback_handle); -#else - sz_hashes_serial(text, length, window_length, window_step, callback, callback_handle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_rfind_charset(h, h_length, &set); -} + // In both of those, regardless of the location of the string - stack or heap, + // the erasing is as easy as setting the length to the offset. + // In every other case, we must `memmove` the tail of the string to the left. + if (offset + length < string_length) + sz_move(string_start + offset, string_start + offset + length, string_length - offset - length); -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_rfind_charset(h, h_length, &set); + // The `string->external.length = offset` assignment would discard last characters + // of the on-the-stack string, but inplace subtraction would work. 
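+    //
+    // Worked example: erasing `length == 2` bytes at `offset == 1` from the 5-byte
+    // string "hello" first moves the two-byte tail "lo" over position 1, then the
+    // subtraction below shrinks the length from 5 to 3 and the terminator is
+    // rewritten, leaving "hlo".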
+ string->external.length -= length; + string_start[string_length - length] = 0; + return length; } -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { - sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data); +SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator) { + if (!sz_string_is_on_stack(string)) + allocator->free(string->external.start, string->external.space, allocator->handle); + sz_string_init(string); } -#endif -#pragma endregion +#pragma endregion // Serial Implementation #ifdef __cplusplus -#pragma GCC diagnostic pop } #endif // __cplusplus - -#endif // STRINGZILLA_H_ +#endif // STRINGZILLA_SMALL_STRING_H_ From 1ba7982559111d4fc9b58caa7bc7aa1c6e64257c Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 18:37:47 +0000 Subject: [PATCH 039/751] Fix: Filter `sort.h` file --- include/stringzilla/sort.h | 7325 ++---------------------------------- 1 file changed, 256 insertions(+), 7069 deletions(-) diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index de7fbcac..4fe64bee 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -1,7156 +1,343 @@ /** - * @brief StringZilla is a collection of advanced string algorithms, designed to be used in Big Data applications. - * It is generally faster than LibC, and has a broader & cleaner interface, and targets modern x86 CPUs - * with AVX-512 and Arm NEON and older CPUs with SWAR and auto-vectorization. - * - * Consider overriding the following macros to customize the library: - * - * - `SZ_DEBUG=0` - whether to enable debug assertions and logging. - * - `SZ_DYNAMIC_DISPATCH=0` - whether to use runtime dispatching of the most advanced SIMD backend. - * - `SZ_USE_MISALIGNED_LOADS=0` - whether to use misaligned loads on platforms that support them. - * - `SZ_SWAR_THRESHOLD=24` - threshold for switching to SWAR backend over serial byte-level for-loops. - * - `SZ_USE_X86_AVX512=?` - whether to use AVX-512 instructions on x86_64. - * - `SZ_USE_X86_AVX2=?` - whether to use AVX2 instructions on x86_64. - * - `SZ_USE_ARM_NEON=?` - whether to use NEON instructions on ARM. - * - `SZ_USE_ARM_SVE=?` - whether to use SVE instructions on ARM. + * @brief Hardware-accelerated string sorting. + * @file sort.h + * @author Ash Vardanian * - * @see StringZilla: https://github.com/ashvardanian/StringZilla/blob/main/README.md - * @see LibC String: https://pubs.opengroup.org/onlinepubs/009695399/basedefs/string.h.html + * Includes core APIs: * - * @file stringzilla.h - * @author Ash Vardanian + * - `sz_partition` - to split the sequence into two parts based on a predicate. + * - `sz_merge` - to merge two consecutive sorted chunks forming the same continuous `sequence`. + * - `sz_sort` - to sort an arbitrary string sequence. + * - `sz_sort_partial` - to partially sort an arbitrary string sequence. */ -#ifndef STRINGZILLA_H_ -#define STRINGZILLA_H_ +#ifndef STRINGZILLA_SORT_H_ +#define STRINGZILLA_SORT_H_ -#define STRINGZILLA_VERSION_MAJOR 3 -#define STRINGZILLA_VERSION_MINOR 11 -#define STRINGZILLA_VERSION_PATCH 0 - -/** - * @brief When set to 1, the library will include the following LibC headers: and . - * In debug builds (SZ_DEBUG=1), the library will also include and . - * - * You may want to disable this compiling for use in the kernel, or in embedded systems. 
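A minimal caller-side sketch of the core API that the new `sort.h` declares. It assumes the `sz_sequence_t` layout (`order`, `count`, `get_start`, `get_length`, `handle`) and the `sz_u64_t` index type carried over from the declarations removed later in this diff; the dataset, callback names, and `main` harness are illustrative only, not part of the patch.

#include <stdio.h>
#include <string.h>
#include "stringzilla/sort.h"

static char const *dataset[] = {"banana", "apple", "cherry"};

static sz_cptr_t dataset_start(sz_sequence_t const *sequence, sz_size_t i) {
    return ((char const **)sequence->handle)[i];
}

static sz_size_t dataset_length(sz_sequence_t const *sequence, sz_size_t i) {
    return strlen(((char const **)sequence->handle)[i]);
}

int main(void) {
    sz_u64_t order[3] = {0, 1, 2};
    sz_sequence_t sequence;
    sequence.order = order;
    sequence.count = 3;
    sequence.get_start = dataset_start;
    sequence.get_length = dataset_length;
    sequence.handle = dataset;
    sz_sort(&sequence); // ascending lexicographic order of indices: 1 (apple), 0 (banana), 2 (cherry)
    for (sz_size_t i = 0; i != sequence.count; ++i) printf("%s\n", dataset[order[i]]);
    return 0;
}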
- * You may also avoid them, if you are very sensitive to compilation time and avoid pre-compiled headers. - * https://artificial-mind.net/projects/compile-health/ - */ -#ifndef SZ_AVOID_LIBC -#define SZ_AVOID_LIBC (0) // true or false -#endif +#include "types.h" -/** - * @brief A misaligned load can be - trying to fetch eight consecutive bytes from an address - * that is not divisible by eight. On x86 enabled by default. On ARM it's not. - * - * Most platforms support it, but there is no industry standard way to check for those. - * This value will mostly affect the performance of the serial (SWAR) backend. - */ -#ifndef SZ_USE_MISALIGNED_LOADS -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) -#define SZ_USE_MISALIGNED_LOADS (1) // true or false -#else -#define SZ_USE_MISALIGNED_LOADS (0) // true or false -#endif +#ifdef __cplusplus +extern "C" { #endif -/** - * @brief Removes compile-time dispatching, and replaces it with runtime dispatching. - * So the `sz_find` function will invoke the most advanced backend supported by the CPU, - * that runs the program, rather than the most advanced backend supported by the CPU - * used to compile the library or the downstream application. - */ -#ifndef SZ_DYNAMIC_DISPATCH -#define SZ_DYNAMIC_DISPATCH (0) // true or false -#endif +#pragma region Core API /** - * @brief Analogous to `size_t` and `std::size_t`, unsigned integer, identical to pointer size. - * 64-bit on most platforms where pointers are 64-bit. - * 32-bit on platforms where pointers are 32-bit. + * @brief Similar to `std::partition`, given a predicate splits the sequence into two parts. + * The algorithm is unstable, meaning that elements may change relative order, as long + * as they are in the right partition. This is the simpler algorithm for partitioning. */ -#if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64) -#define SZ_DETECT_64_BIT (1) -#define SZ_SIZE_MAX (0xFFFFFFFFFFFFFFFFull) // Largest unsigned integer that fits into 64 bits. -#define SZ_SSIZE_MAX (0x7FFFFFFFFFFFFFFFull) // Largest signed integer that fits into 64 bits. -#else -#define SZ_DETECT_64_BIT (0) -#define SZ_SIZE_MAX (0xFFFFFFFFu) // Largest unsigned integer that fits into 32 bits. -#define SZ_SSIZE_MAX (0x7FFFFFFFu) // Largest signed integer that fits into 32 bits. -#endif +SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate); /** - * @brief On Big-Endian machines StringZilla will work in compatibility mode. - * This disables SWAR hacks to minimize code duplication, assuming practically - * all modern popular platforms are Little-Endian. + * @brief Inplace `std::set_union` for two consecutive chunks forming the same continuous `sequence`. * - * This variable is hard to infer from macros reliably. It's best to set it manually. - * For that CMake provides the `TestBigEndian` and `CMAKE__BYTE_ORDER` (from 3.20 onwards). - * In Python one can check `sys.byteorder == 'big'` in the `setup.py` script and pass the appropriate macro. 
- * https://stackoverflow.com/a/27054190 - */ -#ifndef SZ_DETECT_BIG_ENDIAN -#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \ - defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || defined(__MIBSEB) || defined(__MIBSEB__) -#define SZ_DETECT_BIG_ENDIAN (1) //< It's a big-endian target architecture -#else -#define SZ_DETECT_BIG_ENDIAN (0) //< It's a little-endian target architecture -#endif -#endif - -/* - * Debugging and testing. + * @param partition The number of elements in the first sub-sequence in `sequence`. + * @param less Comparison function, to determine the lexicographic ordering. */ -#ifndef SZ_DEBUG -#if defined(DEBUG) || defined(_DEBUG) // This means "Not using DEBUG information". -#define SZ_DEBUG (1) -#else -#define SZ_DEBUG (0) -#endif -#endif +SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less); /** - * @brief Threshold for switching to SWAR (8-bytes at a time) backend over serial byte-level for-loops. - * On very short strings, under 16 bytes long, at most a single word will be processed with SWAR. - * Assuming potentially misaligned loads, SWAR makes sense only after ~24 bytes. - */ -#ifndef SZ_SWAR_THRESHOLD -#if SZ_DEBUG -#define SZ_SWAR_THRESHOLD (8u) // 8 bytes in debug builds -#else -#define SZ_SWAR_THRESHOLD (24u) // 24 bytes in release builds -#endif -#endif - -/* Annotation for the public API symbols: - * - * - `SZ_PUBLIC` is used for functions that are part of the public API. - * - `SZ_INTERNAL` is used for internal helper functions with unstable APIs. - * - `SZ_DYNAMIC` is used for functions that are part of the public API, but are dispatched at runtime. + * @brief Sorting algorithm, combining Radix Sort for the first 32 bits of every word + * and a follow-up by a more conventional sorting procedure on equally prefixed parts. */ -#ifndef SZ_DYNAMIC -#if SZ_DYNAMIC_DISPATCH -#if defined(_WIN32) || defined(__CYGWIN__) -#define SZ_DYNAMIC __declspec(dllexport) -#define SZ_EXTERNAL __declspec(dllimport) -#define SZ_PUBLIC inline static -#define SZ_INTERNAL inline static -#else -#define SZ_DYNAMIC __attribute__((visibility("default"))) -#define SZ_EXTERNAL extern -#define SZ_PUBLIC __attribute__((unused)) inline static -#define SZ_INTERNAL __attribute__((always_inline)) inline static -#endif // _WIN32 || __CYGWIN__ -#else -#define SZ_DYNAMIC inline static -#define SZ_EXTERNAL extern -#define SZ_PUBLIC inline static -#define SZ_INTERNAL inline static -#endif // SZ_DYNAMIC_DISPATCH -#endif // SZ_DYNAMIC +SZ_PUBLIC void sz_sort(sz_sequence_t *sequence); /** - * @brief Alignment macro for 64-byte alignment. - */ -#if defined(_MSC_VER) -#define SZ_ALIGN64 __declspec(align(64)) -#elif defined(__GNUC__) || defined(__clang__) -#define SZ_ALIGN64 __attribute__((aligned(64))) -#else -#define SZ_ALIGN64 -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Let's infer the integer types or pull them from LibC, - * if that is allowed by the user. + * @brief Partial sorting algorithm, combining Radix Sort for the first 32 bits of every word + * and a follow-up by a more conventional sorting procedure on equally prefixed parts. 
*/ -#if !SZ_AVOID_LIBC -#include // `size_t` -#include // `uint8_t` -typedef int8_t sz_i8_t; // Always 8 bits -typedef uint8_t sz_u8_t; // Always 8 bits -typedef uint16_t sz_u16_t; // Always 16 bits -typedef int32_t sz_i32_t; // Always 32 bits -typedef uint32_t sz_u32_t; // Always 32 bits -typedef uint64_t sz_u64_t; // Always 64 bits -typedef int64_t sz_i64_t; // Always 64 bits -typedef size_t sz_size_t; // Pointer-sized unsigned integer, 32 or 64 bits -typedef ptrdiff_t sz_ssize_t; // Signed version of `sz_size_t`, 32 or 64 bits - -#else // if SZ_AVOID_LIBC: - -// ! The C standard doesn't specify the signedness of char. -// ! On x86 char is signed by default while on Arm it is unsigned by default. -// ! That's why we don't define `sz_char_t` and generally use explicit `sz_i8_t` and `sz_u8_t`. -typedef signed char sz_i8_t; // Always 8 bits -typedef unsigned char sz_u8_t; // Always 8 bits -typedef unsigned short sz_u16_t; // Always 16 bits -typedef int sz_i32_t; // Always 32 bits -typedef unsigned int sz_u32_t; // Always 32 bits -typedef long long sz_i64_t; // Always 64 bits -typedef unsigned long long sz_u64_t; // Always 64 bits - -// Now we need to redefine the `size_t`. -// Microsoft Visual C++ (MSVC) typically follows LLP64 data model on 64-bit platforms, -// where integers, pointers, and long types have different sizes: -// -// > `int` is 32 bits -// > `long` is 32 bits -// > `long long` is 64 bits -// > pointer (thus, `size_t`) is 64 bits -// -// In contrast, GCC and Clang on 64-bit Unix-like systems typically follow the LP64 model, where: -// -// > `int` is 32 bits -// > `long` and pointer (thus, `size_t`) are 64 bits -// > `long long` is also 64 bits -// -// Source: https://learn.microsoft.com/en-us/windows/win32/winprog64/abstract-data-models -#if SZ_DETECT_64_BIT -typedef unsigned long long sz_size_t; // 64-bit. -typedef long long sz_ssize_t; // 64-bit. -#else -typedef unsigned sz_size_t; // 32-bit. -typedef unsigned sz_ssize_t; // 32-bit. -#endif // SZ_DETECT_64_BIT - -#endif // SZ_AVOID_LIBC +SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t n); /** - * @brief Compile-time assert macro similar to `static_assert` in C++. + * @brief Intro-Sort algorithm that supports custom comparators. */ -#define sz_static_assert(condition, name) \ - typedef struct { \ - int static_assert_##name : (condition) ? 1 : -1; \ - } sz_static_assert_##name##_t - -sz_static_assert(sizeof(sz_size_t) == sizeof(void *), sz_size_t_must_be_pointer_size); -sz_static_assert(sizeof(sz_ssize_t) == sizeof(void *), sz_ssize_t_must_be_pointer_size); - -#pragma region Public API +SZ_PUBLIC void sz_sort_intro(sz_sequence_t *sequence, sz_sequence_comparator_t less); -typedef char *sz_ptr_t; // A type alias for `char *` -typedef char const *sz_cptr_t; // A type alias for `char const *` -typedef sz_i8_t sz_error_cost_t; // Character mismatch cost for fuzzy matching functions +#pragma endregion -typedef sz_u64_t sz_sorted_idx_t; // Index of a sorted string in a list of strings +#pragma region Serial Implementation -typedef enum { sz_false_k = 0, sz_true_k = 1 } sz_bool_t; // Only one relevant bit -typedef enum { sz_less_k = -1, sz_equal_k = 0, sz_greater_k = 1 } sz_ordering_t; // Only three possible states: <=> +SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) { -/** - * @brief Tiny string-view structure. It's POD type, unlike the `std::string_view`. 
- */ -typedef struct sz_string_view_t { - sz_cptr_t start; - sz_size_t length; -} sz_string_view_t; + sz_size_t matches = 0; + while (matches != sequence->count && predicate(sequence, sequence->order[matches])) ++matches; -/** - * @brief Enumeration of SIMD capabilities of the target architecture. - * Used to introspect the supported functionality of the dynamic library. - */ -typedef enum sz_capability_t { - sz_cap_serial_k = 1, /// Serial (non-SIMD) capability - sz_cap_any_k = 0x7FFFFFFF, /// Mask representing any capability + for (sz_size_t i = matches + 1; i < sequence->count; ++i) + if (predicate(sequence, sequence->order[i])) + sz_u64_swap(sequence->order + i, sequence->order + matches), ++matches; - sz_cap_arm_neon_k = 1 << 10, /// ARM NEON capability - sz_cap_arm_sve_k = 1 << 11, /// ARM SVE capability TODO: Not yet supported or used - sz_cap_arm_sve2_k = 1 << 12, - sz_cap_arm_sve2p1_k = 1 << 13, - sz_cap_x86_avx2_k = 1 << 20, /// x86 AVX2 capability - sz_cap_x86_avx512f_k = 1 << 21, /// x86 AVX512 F capability - sz_cap_x86_avx512bw_k = 1 << 22, /// x86 AVX512 BW instruction capability - sz_cap_x86_avx512vl_k = 1 << 23, /// x86 AVX512 VL instruction capability - sz_cap_x86_avx512vbmi_k = 1 << 24, /// x86 AVX512 VBMI instruction capability - sz_cap_x86_gfni_k = 1 << 25, /// x86 AVX512 GFNI instruction capability + return matches; +} -} sz_capability_t; +SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) { -/** - * @brief Function to determine the SIMD capabilities of the current machine @b only at @b runtime. - * @return A bitmask of the SIMD capabilities represented as a `sz_capability_t` enum value. - */ -SZ_DYNAMIC sz_capability_t sz_capabilities(void); + sz_size_t start_b = partition + 1; -/** - * @brief Bit-set structure for 256 possible byte values. Useful for filtering and search. - * @see sz_charset_init, sz_charset_add, sz_charset_contains, sz_charset_invert - */ -typedef union sz_charset_t { - sz_u64_t _u64s[4]; - sz_u32_t _u32s[8]; - sz_u16_t _u16s[16]; - sz_u8_t _u8s[32]; -} sz_charset_t; + // If the direct merge is already sorted + if (!less(sequence, sequence->order[start_b], sequence->order[partition])) return; -/** @brief Initializes a bit-set to an empty collection, meaning - all characters are banned. */ -SZ_PUBLIC void sz_charset_init(sz_charset_t *s) { s->_u64s[0] = s->_u64s[1] = s->_u64s[2] = s->_u64s[3] = 0; } + sz_size_t start_a = 0; + while (start_a <= partition && start_b <= sequence->count) { -/** @brief Adds a character to the set and accepts @b unsigned integers. */ -SZ_PUBLIC void sz_charset_add_u8(sz_charset_t *s, sz_u8_t c) { s->_u64s[c >> 6] |= (1ull << (c & 63u)); } + // If element 1 is in right place + if (!less(sequence, sequence->order[start_b], sequence->order[start_a])) { start_a++; } + else { + sz_size_t value = sequence->order[start_b]; + sz_size_t index = start_b; -/** @brief Adds a character to the set. Consider @b sz_charset_add_u8. */ -SZ_PUBLIC void sz_charset_add(sz_charset_t *s, char c) { sz_charset_add_u8(s, *(sz_u8_t *)(&c)); } // bitcast + // Shift all the elements between element 1 + // element 2, right by 1. + while (index != start_a) { sequence->order[index] = sequence->order[index - 1], index--; } + sequence->order[start_a] = value; -/** @brief Checks if the set contains a given character and accepts @b unsigned integers. 
*/ -SZ_PUBLIC sz_bool_t sz_charset_contains_u8(sz_charset_t const *s, sz_u8_t c) { - // Checking the bit can be done in different ways: - // - (s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0 - // - (s->_u32s[c >> 5] & (1u << (c & 31u))) != 0 - // - (s->_u16s[c >> 4] & (1u << (c & 15u))) != 0 - // - (s->_u8s[c >> 3] & (1u << (c & 7u))) != 0 - return (sz_bool_t)((s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0); + // Update all the pointers + start_a++; + partition++; + start_b++; + } + } } -/** @brief Checks if the set contains a given character. Consider @b sz_charset_contains_u8. */ -SZ_PUBLIC sz_bool_t sz_charset_contains(sz_charset_t const *s, char c) { - return sz_charset_contains_u8(s, *(sz_u8_t *)(&c)); // bitcast +SZ_PUBLIC void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t less) { + sz_u64_t *keys = sequence->order; + sz_size_t keys_count = sequence->count; + for (sz_size_t i = 1; i < keys_count; i++) { + sz_u64_t i_key = keys[i]; + sz_size_t j = i; + for (; j > 0 && less(sequence, i_key, keys[j - 1]); --j) keys[j] = keys[j - 1]; + keys[j] = i_key; + } } -/** @brief Inverts the contents of the set, so allowed character get disallowed, and vice versa. */ -SZ_PUBLIC void sz_charset_invert(sz_charset_t *s) { - s->_u64s[0] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[1] ^= 0xFFFFFFFFFFFFFFFFull, // - s->_u64s[2] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[3] ^= 0xFFFFFFFFFFFFFFFFull; +SZ_INTERNAL void _sz_sift_down( // + sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start, sz_size_t end) { + sz_size_t root = start; + while (2 * root + 1 <= end) { + sz_size_t child = 2 * root + 1; + if (child + 1 <= end && less(sequence, order[child], order[child + 1])) { child++; } + if (!less(sequence, order[root], order[child])) { return; } + sz_u64_swap(order + root, order + child); + root = child; + } } -typedef void *(*sz_memory_allocate_t)(sz_size_t, void *); -typedef void (*sz_memory_free_t)(void *, sz_size_t, void *); -typedef sz_u64_t (*sz_random_generator_t)(void *); - -/** - * @brief Some complex pattern matching algorithms may require memory allocations. - * This structure is used to pass the memory allocator to those functions. - * @see sz_memory_allocator_init_fixed - */ -typedef struct sz_memory_allocator_t { - sz_memory_allocate_t allocate; - sz_memory_free_t free; - void *handle; -} sz_memory_allocator_t; - -/** - * @brief Initializes a memory allocator to use the system default `malloc` and `free`. - * ! The function is not available if the library was compiled with `SZ_AVOID_LIBC`. - * - * @param alloc Memory allocator to initialize. - */ -SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc); +SZ_INTERNAL void _sz_heapify(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t count) { + sz_size_t start = (count - 2) / 2; + while (1) { + _sz_sift_down(sequence, less, order, start, count - 1); + if (start == 0) return; + start--; + } +} -/** - * @brief Initializes a memory allocator to use a static-capacity buffer. - * No dynamic allocations will be performed. - * - * @param alloc Memory allocator to initialize. - * @param buffer Buffer to use for allocations. - * @param length Length of the buffer. @b Must be greater than 8 bytes. Different values would be optimal for - * different algorithms and input lengths, but 4096 bytes (one RAM page) is a good default. 
- */ -SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length); +SZ_INTERNAL void _sz_heapsort(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last) { + sz_u64_t *order = sequence->order; + sz_size_t count = last - first; + _sz_heapify(sequence, less, order + first, count); + sz_size_t end = count - 1; + while (end > 0) { + sz_u64_swap(order + first, order + first + end); + end--; + _sz_sift_down(sequence, less, order + first, 0, end); + } +} -/** - * @brief The number of bytes a stack-allocated string can hold, including the SZ_NULL termination character. - * ! This can't be changed from outside. Don't use the `#error` as it may already be included and set. - */ -#ifdef SZ_STRING_INTERNAL_SPACE -#undef SZ_STRING_INTERNAL_SPACE -#endif -#define SZ_STRING_INTERNAL_SPACE (sizeof(sz_size_t) * 3 - 1) // 3 pointers minus one byte for an 8-bit length +SZ_PUBLIC void sz_sort_introsort_recursion( // + sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last, sz_size_t depth) { -/** - * @brief Tiny memory-owning string structure with a Small String Optimization (SSO). - * Differs in layout from Folly, Clang, GCC, and probably most other implementations. - * It's designed to avoid any branches on read-only operations, and can store up - * to 22 characters on stack on 64-bit machines, followed by the SZ_NULL-termination character. - * - * @section Changing Length - * - * One nice thing about this design, is that you can, in many cases, change the length of the string - * without any branches, invoking a `+=` or `-=` on the 64-bit `length` field. If the string is on heap, - * the solution is obvious. If it's on stack, inplace decrement wouldn't affect the top bytes of the string, - * only changing the last byte containing the length. 
- */ -typedef union sz_string_t { + sz_size_t length = last - first; + switch (length) { + case 0: + case 1: return; + case 2: + if (less(sequence, sequence->order[first + 1], sequence->order[first])) + sz_u64_swap(&sequence->order[first], &sequence->order[first + 1]); + return; + case 3: { + sz_u64_t a = sequence->order[first]; + sz_u64_t b = sequence->order[first + 1]; + sz_u64_t c = sequence->order[first + 2]; + if (less(sequence, b, a)) sz_u64_swap(&a, &b); + if (less(sequence, c, b)) sz_u64_swap(&c, &b); + if (less(sequence, b, a)) sz_u64_swap(&a, &b); + sequence->order[first] = a; + sequence->order[first + 1] = b; + sequence->order[first + 2] = c; + return; + } + } + // Until a certain length, the quadratic-complexity insertion-sort is fine + if (length <= 16) { + sz_sequence_t sub_seq = *sequence; + sub_seq.order += first; + sub_seq.count = length; + sz_sort_insertion(&sub_seq, less); + return; + } -#if !SZ_DETECT_BIG_ENDIAN + // Fallback to N-logN-complexity heap-sort + if (depth == 0) { + _sz_heapsort(sequence, less, first, last); + return; + } - struct external { - sz_ptr_t start; - sz_size_t length; - sz_size_t space; - sz_size_t padding; - } external; + --depth; - struct internal { - sz_ptr_t start; - sz_u8_t length; - char chars[SZ_STRING_INTERNAL_SPACE]; - } internal; + // Median-of-three logic to choose pivot + sz_size_t median = first + length / 2; + if (less(sequence, sequence->order[median], sequence->order[first])) + sz_u64_swap(&sequence->order[first], &sequence->order[median]); + if (less(sequence, sequence->order[last - 1], sequence->order[first])) + sz_u64_swap(&sequence->order[first], &sequence->order[last - 1]); + if (less(sequence, sequence->order[median], sequence->order[last - 1])) + sz_u64_swap(&sequence->order[median], &sequence->order[last - 1]); -#else + // Partition using the median-of-three as the pivot + sz_u64_t pivot = sequence->order[median]; + sz_size_t left = first; + sz_size_t right = last - 1; + while (1) { + while (less(sequence, sequence->order[left], pivot)) left++; + while (less(sequence, pivot, sequence->order[right])) right--; + if (left >= right) break; + sz_u64_swap(&sequence->order[left], &sequence->order[right]); + left++; + right--; + } - struct external { - sz_ptr_t start; - sz_size_t space; - sz_size_t padding; - sz_size_t length; - } external; + // Recursively sort the partitions + sz_sort_introsort_recursion(sequence, less, first, left, depth); + sz_sort_introsort_recursion(sequence, less, right + 1, last, depth); +} - struct internal { - sz_ptr_t start; - char chars[SZ_STRING_INTERNAL_SPACE]; - sz_u8_t length; - } internal; +SZ_PUBLIC void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) { + if (sequence->count == 0) return; + sz_size_t size_is_not_power_of_two = (sequence->count & (sequence->count - 1)) != 0; + sz_size_t depth_limit = sz_size_log2i_nonzero(sequence->count) + size_is_not_power_of_two; + sz_sort_introsort_recursion(sequence, less, 0, sequence->count, depth_limit); +} -#endif +SZ_PUBLIC void sz_sort_recursion( // + sz_sequence_t *sequence, sz_size_t bit_idx, sz_size_t bit_max, sz_sequence_comparator_t comparator, + sz_size_t partial_order_length) { - sz_size_t words[4]; + if (!sequence->count) return; -} sz_string_t; + // Array of size one doesn't need sorting - only needs the prefix to be discarded. 
+ if (sequence->count == 1) { + sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; + order_half_words[1] = 0; + return; + } -typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t); -typedef sz_u64_t (*sz_checksum_t)(sz_cptr_t, sz_size_t); -typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t); -typedef sz_ordering_t (*sz_order_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -typedef void (*sz_to_converter_t)(sz_cptr_t, sz_size_t, sz_ptr_t); + // Partition a range of integers according to a specific bit value + sz_size_t split = 0; + sz_u64_t mask = (1ull << 63) >> bit_idx; -/** - * @brief Computes the 64-bit check-sum of bytes in a string. - * Similar to `std::ranges::accumulate`. - * - * @param text String to aggregate. - * @param length Number of bytes in the text. - * @return 64-bit unsigned value. - */ -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length); + // The clean approach would be to perform a single pass over the sequence. + // + // while (split != sequence->count && !(sequence->order[split] & mask)) ++split; + // for (sz_size_t i = split + 1; i < sequence->count; ++i) + // if (!(sequence->order[i] & mask)) sz_u64_swap(sequence->order + i, sequence->order + split), ++split; + // + // This, however, doesn't take into account the high relative cost of writes and swaps. + // To circumvent that, we can first count the total number entries to be mapped into either part. + // And then walk through both parts, swapping the entries that are in the wrong part. + // This would often lead to ~15% performance gain. + sz_size_t count_with_bit_set = 0; + for (sz_size_t i = 0; i != sequence->count; ++i) count_with_bit_set += (sequence->order[i] & mask) != 0; + split = sequence->count - count_with_bit_set; -/** @copydoc sz_checksum */ -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length); - -/** - * @brief Computes the 64-bit unsigned hash of a string. Fairly fast for short strings, - * simple implementation, and supports rolling computation, reused in other APIs. - * Similar to `std::hash` in C++. - * - * @param text String to hash. - * @param length Number of bytes in the text. - * @return 64-bit hash value. - * - * @see sz_hashes, sz_hashes_fingerprint, sz_hashes_intersection - */ -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length); - -/** @copydoc sz_hash */ -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length); - -/** - * @brief Checks if two string are equal. - * Similar to `memcmp(a, b, length) == 0` in LibC and `a == b` in STL. - * - * The implementation of this function is very similar to `sz_order`, but the usage patterns are different. - * This function is more often used in parsing, while `sz_order` is often used in sorting. - * It works best on platforms with cheap - * - * @param a First string to compare. - * @param b Second string to compare. - * @param length Number of bytes in both strings. - * @return 1 if strings match, 0 otherwise. - */ -SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length); - -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length); - -/** - * @brief Estimates the relative order of two strings. Equivalent to `memcmp(a, b, length)` in LibC. - * Can be used on different length strings. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. 
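The counting trick above, shown in isolation as a hedged sketch on a plain array of 64-bit keys (the helper names are illustrative, not StringZilla APIs): once the number of keys with the radix bit set is known, the final boundary `split` is fixed, and only the entries sitting on the wrong side of it ever need to be swapped.

#include <stddef.h>
#include <stdint.h>

static void swap_u64(uint64_t *a, uint64_t *b) { uint64_t t = *a; *a = *b, *b = t; }

// Partitions `keys` so that entries with the `mask` bit unset come first;
// returns the index of the first entry with the bit set.
static size_t partition_by_bit(uint64_t *keys, size_t count, uint64_t mask) {
    size_t with_bit = 0;
    for (size_t i = 0; i != count; ++i) with_bit += (keys[i] & mask) != 0;
    size_t split = count - with_bit;
    for (size_t lo = 0, hi = split; lo < split && hi < count;) {
        if ((keys[lo] & mask) == 0) { ++lo; }      // already in the lower part
        else if ((keys[hi] & mask) != 0) { ++hi; } // already in the upper part
        else { swap_u64(keys + lo, keys + hi), ++lo, ++hi; }
    }
    return split;
}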
- * @return Negative if (a < b), positive if (a > b), zero if they are equal. - */ -SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); - -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); - -/** - * @brief Look Up Table @b (LUT) transformation of a string. Equivalent to `for (char & c : text) c = lut[c]`. - * - * Can be used to implement some form of string normalization, partially masking punctuation marks, - * or converting between different character sets, like uppercase or lowercase. Surprisingly, also has - * broad implications in image processing, where image channel transformations are often done using LUTs. - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param lut Look Up Table to apply. Must be exactly @b 256 bytes long. - * @param result Output string, can point to the same address as ::text. - */ -SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); - -typedef void (*sz_look_up_transform_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t); - -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = tolower(c)`. - * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_tolower(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = toupper(c)`. - * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_toupper(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = toascii(c)`. - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Checks if all characters in the range are valid ASCII characters. - * - * @param text String to be analyzed. - * @param length Number of bytes in the string. - * @return Whether all characters are valid ASCII characters. - */ -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t text, sz_size_t length); - -/** - * @brief Generates a random string for a given alphabet, avoiding integer division and modulo operations. 
- * Similar to `text[i] = alphabet[rand() % cardinality]`. - * - * The modulo operation is expensive, and should be avoided in performance-critical code. - * We avoid it using small lookup tables and replacing it with a multiplication and shifts, similar to `libdivide`. - * Alternative algorithms would include: - * - Montgomery form: https://en.algorithmica.org/hpc/number-theory/montgomery/ - * - Barret reduction: https://www.nayuki.io/page/barrett-reduction-algorithm - * - Lemire's trick: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ - * - * @param alphabet Set of characters to sample from. - * @param cardinality Number of characters to sample from. - * @param text Output string, can point to the same address as ::text. - * @param generate Callback producing random numbers given the generator state. - * @param generator Generator state, can be a pointer to a seed, or a pointer to a random number generator. - */ -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); - -/** @copydoc sz_generate */ -SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); - -/** - * @brief Similar to `memcpy`, copies contents of one string into another. - * The behavior is undefined if the strings overlap. - * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. - */ -SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** - * @brief Similar to `memmove`, copies (moves) contents of one string into another. - * Unlike `sz_copy`, allows overlapping strings as arguments. - * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. - */ -SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -typedef void (*sz_move_t)(sz_ptr_t, sz_cptr_t, sz_size_t); - -/** - * @brief Similar to `memset`, fills a string with a given value. - * - * @param target String to fill. - * @param length Number of bytes to fill. - * @param value Value to fill with. - */ -SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value); - -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value); - -typedef void (*sz_fill_t)(sz_ptr_t, sz_size_t, sz_u8_t); - -/** - * @brief Initializes a string class instance to an empty value. - */ -SZ_PUBLIC void sz_string_init(sz_string_t *string); - -/** - * @brief Convenience function checking if the provided string is stored inside of the ::string instance itself, - * alternative being - allocated in a remote region of the heap. - */ -SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string); - -/** - * @brief Unpacks the opaque instance of a string class into its components. - * Recommended to use only in read-only operations. - * - * @param string String to unpack. - * @param start Pointer to the start of the string. - * @param length Number of bytes in the string, before the SZ_NULL character. 
- * @param space Number of bytes allocated for the string (heap or stack), including the SZ_NULL character. - * @param is_external Whether the string is allocated on the heap externally, or fits withing ::string instance. - */ -SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, - sz_bool_t *is_external); - -/** - * @brief Unpacks only the start and length of the string. - * Recommended to use only in read-only operations. - * - * @param string String to unpack. - * @param start Pointer to the start of the string. - * @param length Number of bytes in the string, before the SZ_NULL character. - */ -SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length); - -/** - * @brief Constructs a string of a given ::length with noisy contents. - * Use the returned character pointer to populate the string. - * - * @param string String to initialize. - * @param length Number of bytes in the string, before the SZ_NULL character. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator); - -/** - * @brief Doesn't change the contents or the length of the string, but grows the available memory capacity. - * This is beneficial, if several insertions are expected, and we want to minimize allocations. - * - * @param string String to grow. - * @param new_capacity The number of characters to reserve space for, including existing ones. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the new start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator); - -/** - * @brief Grows the string by adding an uninitialized region of ::added_length at the given ::offset. - * Would often be used in conjunction with one or more `sz_copy` calls to populate the allocated region. - * Similar to `sz_string_reserve`, but changes the length of the ::string. - * - * @param string String to grow. - * @param offset Offset of the first byte to reserve space for. - * If provided offset is larger than the length, it will be capped. - * @param added_length The number of new characters to reserve space for. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the new start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length, - sz_memory_allocator_t *allocator); - -/** - * @brief Removes a range from a string. Changes the length, but not the capacity. - * Performs no allocations or deallocations and can't fail. - * - * @param string String to clean. - * @param offset Offset of the first byte to remove. - * @param length Number of bytes to remove. Out-of-bound ranges will be capped. - * @return Number of bytes removed. - */ -SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length); - -/** - * @brief Shrinks the string to fit the current length, if it's allocated on the heap. - * It's the reverse operation of ::sz_string_reserve. - * - * @param string String to shrink. - * @param allocator Memory allocator to use for the allocation. - * @return Whether the operation was successful. 
The only failures can come from the allocator. - * On failure, the string will remain unchanged. - */ -SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator); - -/** - * @brief Frees the string, if it's allocated on the heap. - * If the string is on the stack, the function clears/resets the state. - */ -SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator); - -#pragma endregion - -#pragma region Fast Substring Search API - -typedef sz_cptr_t (*sz_find_byte_t)(sz_cptr_t, sz_size_t, sz_cptr_t); -typedef sz_cptr_t (*sz_find_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_charset_t const *); - -/** - * @brief Locates first matching byte in a string. Equivalent to `memchr(haystack, *needle, h_length)` in LibC. - * - * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memchr.S - * Aarch64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/aarch64/memchr.S - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the first match. - */ -SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** - * @brief Locates last matching byte in a string. Equivalent to `memrchr(haystack, *needle, h_length)` in LibC. - * - * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memrchr.S - * Aarch64 implementation: missing - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the last match. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** - * @brief Locates first matching substring. - * Equivalent to `memmem(haystack, h_length, needle, n_length)` in LibC. - * Similar to `strstr(haystack, needle)` in LibC, but requires known length. - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the first match. - */ -SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** - * @brief Locates the last matching substring. - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the last match. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** - * @brief Finds the first character present from the ::set, present in ::text. 
- * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_rfind_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". - * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the first matching character from ::set. - */ -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** - * @brief Finds the last character present from the ::set, present in ::text. - * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_find_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". - * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the last matching character from ::set. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -#pragma endregion - -#pragma region String Similarity Measures API - -/** - * @brief Computes the Hamming distance between two strings - number of not matching characters. - * Difference in length is is counted as a mismatch. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for the distance, the `bound` if was exceeded. - * - * @see sz_hamming_distance_utf8 - * @see https://en.wikipedia.org/wiki/Hamming_distance - */ -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); - -/** @copydoc sz_hamming_distance */ -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); - -/** - * @brief Computes the Hamming distance between two @b UTF8 strings - number of not matching characters. - * Difference in length is is counted as a mismatch. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. 
- * @return Unsigned integer for the distance, the `bound` if was exceeded. - * - * @see sz_hamming_distance - * @see https://en.wikipedia.org/wiki/Hamming_distance - */ -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); - -/** @copydoc sz_hamming_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); - -typedef sz_size_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t); - -/** - * @brief Computes the Levenshtein edit-distance between two strings using the Wagner-Fisher algorithm. - * Similar to the Needleman-Wunsch alignment algorithm. Often used in fuzzy string matching. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Exclusive upper bound on the distance, that allows us to exit early. - * Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore. - * Pass zero to check if the strings are equal. - * @return Unsigned integer for the edit distance. Zero means the strings are equal. - * Returns the `bound` if it was exceeded or `SZ_SIZE_MAX` if the memory allocation failed. - * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default - * @see https://en.wikipedia.org/wiki/Levenshtein_distance - */ -SZ_DYNAMIC sz_size_t sz_edit_distance(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** @copydoc sz_edit_distance */ -SZ_PUBLIC sz_size_t sz_edit_distance_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** - * @brief Computes the Levenshtein edit-distance between two @b UTF8 strings. - * Unlike `sz_edit_distance`, reports the distance in Unicode codepoints, and not in bytes. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for edit distance, the `bound` if was exceeded or `SZ_SIZE_MAX` - * if the memory allocation failed. 
- * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default, sz_edit_distance - * @see https://en.wikipedia.org/wiki/Levenshtein_distance - */ -SZ_DYNAMIC sz_size_t sz_edit_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -typedef sz_size_t (*sz_edit_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_memory_allocator_t *); - -/** @copydoc sz_edit_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_edit_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** - * @brief Computes Needleman–Wunsch alignment score for two string. Often used in bioinformatics and cheminformatics. - * Similar to the Levenshtein edit-distance, parameterized for gap and substitution penalties. - * - * Not commutative in the general case, as the order of the strings matters, as `sz_alignment_score(a, b)` may - * not be equal to `sz_alignment_score(b, a)`. Becomes @b commutative, if the substitution costs are symmetric. - * Equivalent to the negative Levenshtein distance, if: `gap == -1` and `subs[i][j] == (i == j ? 0: -1)`. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * @param gap Penalty cost for gaps - insertions and removals. - * @param subs Substitution costs matrix with 256 x 256 values for all pairs of characters. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @return Signed similarity score. Can be negative, depending on the substitution costs. - * If the memory allocation fails, the function returns `SZ_SSIZE_MAX`. - * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default - * @see https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm - */ -SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); - -/** @copydoc sz_alignment_score */ -SZ_PUBLIC sz_ssize_t sz_alignment_score_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); - -typedef sz_ssize_t (*sz_alignment_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *, - sz_error_cost_t, sz_memory_allocator_t *); - -typedef void (*sz_hash_callback_t)(sz_cptr_t, sz_size_t, sz_u64_t, void *user); - -/** - * @brief Computes the Karp-Rabin rolling hashes of a string supplying them to the provided `callback`. - * Can be used for similarity scores, search, ranking, etc. - * - * Rabin-Karp-like rolling hashes can have very high-level of collisions and depend - * on the choice of bases and the prime number. That's why, often two hashes from the same - * family are used with different bases. - * - * 1. Kernighan and Ritchie's function uses 31, a prime close to the size of English alphabet. - * 2. To be friendlier to byte-arrays and UTF8, we use 257 for the second function. - * - * Choosing the right ::window_length is task- and domain-dependant. 
For example, most English words are - * between 3 and 7 characters long, so a window of 4 bytes would be a good choice. For DNA sequences, - * the ::window_length might be a multiple of 3, as the codons are 3 (nucleotides) bytes long. - * With such minimalistic alphabets of just four characters (AGCT) longer windows might be needed. - * For protein sequences the alphabet is 20 characters long, so the window can be shorter, than for DNAs. - * - * @param text String to hash. - * @param length Number of bytes in the string. - * @param window_length Length of the rolling window in bytes. - * @param window_step Step of reported hashes. @b Must be power of two. Should be smaller than `window_length`. - * @param callback Function receiving the start & length of a substring, the hash, and the `callback_handle`. - * @param callback_handle Optional user-provided pointer to be passed to the `callback`. - * @see sz_hashes_fingerprint, sz_hashes_intersection - */ -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); - -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); - -typedef void (*sz_hashes_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_size_t, sz_hash_callback_t, void *); - -/** - * @brief Computes the Karp-Rabin rolling hashes of a string outputting a binary fingerprint. - * Such fingerprints can be compared with Hamming or Jaccard (Tanimoto) distance for similarity. - * - * The algorithm doesn't clear the fingerprint buffer on start, so it can be invoked multiple times - * to produce a fingerprint of a longer string, by passing the previous fingerprint as the ::fingerprint. - * It can also be reused to produce multi-resolution fingerprints by changing the ::window_length - * and calling the same function multiple times for the same input ::text. - * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text String to hash. - * @param length Number of bytes in the string. - * @param fingerprint Output fingerprint buffer. - * @param fingerprint_bytes Number of bytes in the fingerprint buffer. - * @param window_length Length of the rolling window in bytes. - * @see sz_hashes, sz_hashes_intersection - */ -SZ_PUBLIC void sz_hashes_fingerprint( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_ptr_t fingerprint, sz_size_t fingerprint_bytes); - -typedef void (*sz_hashes_fingerprint_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_ptr_t, sz_size_t); - -/** - * @brief Given a hash-fingerprint of a textual document, computes the number of intersecting hashes - * of the incoming document. Can be used for document scoring and search. - * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text Input document. - * @param length Number of bytes in the input document. - * @param fingerprint Reference document fingerprint. - * @param fingerprint_bytes Number of bytes in the reference documents fingerprint. - * @param window_length Length of the rolling window in bytes. 
- * @see sz_hashes, sz_hashes_fingerprint - */ -SZ_PUBLIC sz_size_t sz_hashes_intersection( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_cptr_t fingerprint, sz_size_t fingerprint_bytes); - -typedef sz_size_t (*sz_hashes_intersection_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_cptr_t, sz_size_t); - -#pragma endregion - -#pragma region Convenience API - -/** - * @brief Finds the first character in the haystack, that is present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the first character in the haystack, that is @b not present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the last character in the haystack, that is present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the last character in the haystack, that is @b not present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -#pragma endregion - -#pragma region String Sequences API - -struct sz_sequence_t; - -typedef sz_cptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t); -typedef sz_bool_t (*sz_string_is_less_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); - -typedef struct sz_sequence_t { - sz_sorted_idx_t *order; - sz_size_t count; - sz_sequence_member_start_t get_start; - sz_sequence_member_length_t get_length; - void const *handle; -} sz_sequence_t; - -/** - * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. - * Expects ::offsets to contains `count + 1` entries, the last pointing at the end - * of the last string, indicating the total length of the ::tape. - */ -SZ_PUBLIC void sz_sequence_from_u32tape(sz_cptr_t *start, sz_u32_t const *offsets, sz_size_t count, - sz_sequence_t *sequence); - -/** - * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. - * Expects ::offsets to contains `count + 1` entries, the last pointing at the end - * of the last string, indicating the total length of the ::tape. - */ -SZ_PUBLIC void sz_sequence_from_u64tape(sz_cptr_t *start, sz_u64_t const *offsets, sz_size_t count, - sz_sequence_t *sequence); - -/** - * @brief Similar to `std::partition`, given a predicate splits the sequence into two parts. - * The algorithm is unstable, meaning that elements may change relative order, as long - * as they are in the right partition. This is the simpler algorithm for partitioning. 
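A minimal, hedged usage sketch for the sequence interface above: wrapping a user-owned array of `sz_string_view_t` views so the sorting routine declared just below can reorder it. The adapter and buffer names (`demo_get_start`, `demo_get_length`, `demo_views`) are illustrative, not part of the library, and the expectation that `sz_sort` reports its result by permuting the `order` array follows from the field's role rather than an explicit guarantee in this header.

#include <stringzilla/stringzilla.h>

static sz_cptr_t demo_get_start(sz_sequence_t const *sequence, sz_size_t i) {
    sz_string_view_t const *views = (sz_string_view_t const *)sequence->handle;
    return views[i].start;
}

static sz_size_t demo_get_length(sz_sequence_t const *sequence, sz_size_t i) {
    sz_string_view_t const *views = (sz_string_view_t const *)sequence->handle;
    return views[i].length;
}

static void demo_sort_views(void) {
    sz_string_view_t demo_views[3] = {{"mouse", 5}, {"cat", 3}, {"dog", 3}};
    sz_sorted_idx_t order[3] = {0, 1, 2}; // assumed to start as the identity permutation
    sz_sequence_t sequence;
    sequence.order = order;
    sequence.count = 3;
    sequence.get_start = demo_get_start;
    sequence.get_length = demo_get_length;
    sequence.handle = demo_views;
    sz_sort(&sequence); // expected to leave `order` as {1, 2, 0}: "cat", "dog", "mouse"
}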
- */ -SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate); - -/** - * @brief Inplace `std::set_union` for two consecutive chunks forming the same continuous `sequence`. - * - * @param partition The number of elements in the first sub-sequence in `sequence`. - * @param less Comparison function, to determine the lexicographic ordering. - */ -SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less); - -/** - * @brief Sorting algorithm, combining Radix Sort for the first 32 bits of every word - * and a follow-up by a more conventional sorting procedure on equally prefixed parts. - */ -SZ_PUBLIC void sz_sort(sz_sequence_t *sequence); - -/** - * @brief Partial sorting algorithm, combining Radix Sort for the first 32 bits of every word - * and a follow-up by a more conventional sorting procedure on equally prefixed parts. - */ -SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t n); - -/** - * @brief Intro-Sort algorithm that supports custom comparators. - */ -SZ_PUBLIC void sz_sort_intro(sz_sequence_t *sequence, sz_sequence_comparator_t less); - -#pragma endregion - -/* - * Hardware feature detection. - * All of those can be controlled by the user. - */ -#ifndef SZ_USE_X86_AVX512 -#ifdef __AVX512BW__ -#define SZ_USE_X86_AVX512 1 -#else -#define SZ_USE_X86_AVX512 0 -#endif -#endif - -#ifndef SZ_USE_X86_AVX2 -#ifdef __AVX2__ -#define SZ_USE_X86_AVX2 1 -#else -#define SZ_USE_X86_AVX2 0 -#endif -#endif - -#ifndef SZ_USE_ARM_NEON -#ifdef __ARM_NEON -#define SZ_USE_ARM_NEON 1 -#else -#define SZ_USE_ARM_NEON 0 -#endif -#endif - -#ifndef SZ_USE_ARM_SVE -#ifdef __ARM_FEATURE_SVE -#define SZ_USE_ARM_SVE 1 -#else -#define SZ_USE_ARM_SVE 0 -#endif -#endif - -/* - * Include hardware-specific headers. - */ -#if SZ_USE_X86_AVX512 || SZ_USE_X86_AVX2 -#include -#endif // SZ_USE_X86... 
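Because each of these detection macros honors a user-provided value, a consuming translation unit can pin the dispatch at compile time before including the header. A minimal sketch; the particular on/off combination is just one possible configuration:

// Force the portable serial kernels on AVX-512-capable builds, while keeping the AVX2 paths.
#define SZ_USE_X86_AVX512 0
#define SZ_USE_X86_AVX2 1
#include <stringzilla/stringzilla.h>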
-#if SZ_USE_ARM_NEON -#if !defined(_MSC_VER) -#include -#endif -#include -#endif // SZ_USE_ARM_NEON -#if SZ_USE_ARM_SVE -#if !defined(_MSC_VER) -#include -#endif -#endif // SZ_USE_ARM_SVE - -#pragma region Hardware Specific API - -#if SZ_USE_X86_AVX512 - -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_edit_distance */ -SZ_PUBLIC sz_size_t sz_edit_distance_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); -/** @copydoc sz_alignment_score */ -SZ_PUBLIC sz_ssize_t sz_alignment_score_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle); -#endif - -#if SZ_USE_X86_AVX2 -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t 
n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle); -#endif - -#if SZ_USE_ARM_NEON -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -#endif - -#if SZ_USE_ARM_SVE -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_sve(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_sve(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_sve(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_sve(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_sve(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -#endif - -#pragma endregion - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wconversion" - -/* - ********************************************************************************************************************** - 
********************************************************************************************************************** - ********************************************************************************************************************** - * - * This is where we the actual implementation begins. - * The rest of the file is hidden from the public API. - * - ********************************************************************************************************************** - ********************************************************************************************************************** - ********************************************************************************************************************** - */ - -#pragma region Compiler Extensions and Helper Functions - -#pragma GCC visibility push(hidden) - -/** - * @brief Helper-macro to mark potentially unused variables. - */ -#define sz_unused(x) ((void)(x)) - -/** - * @brief Helper-macro casting a variable to another type of the same size. - */ -#define sz_bitcast(type, value) (*((type *)&(value))) - -/** - * @brief Defines `SZ_NULL`, analogous to `NULL`. - * The default often comes from locale.h, stddef.h, - * stdio.h, stdlib.h, string.h, time.h, or wchar.h. - */ -#ifdef __GNUG__ -#define SZ_NULL __null -#define SZ_NULL_CHAR __null -#else -#define SZ_NULL ((void *)0) -#define SZ_NULL_CHAR ((char *)0) -#endif - -/** - * @brief Cache-line width, that will affect the execution of some algorithms, - * like equality checks and relative order computing. - */ -#define SZ_CACHE_LINE_WIDTH (64) // bytes - -/** - * @brief Similar to `assert`, the `sz_assert` is used in the SZ_DEBUG mode - * to check the invariants of the library. It's a no-op in the SZ_RELEASE mode. - * @note If you want to catch it, put a breakpoint at @b `__GI_exit` - */ -#if SZ_DEBUG && defined(SZ_AVOID_LIBC) && !SZ_AVOID_LIBC && !defined(SZ_PIC) -#include // `fprintf` -#include // `EXIT_FAILURE` -SZ_PUBLIC void _sz_assert_failure(char const *condition, char const *file, int line) { - fprintf(stderr, "Assertion failed: %s, in file %s, line %d\n", condition, file, line); - exit(EXIT_FAILURE); -} -#define sz_assert(condition) \ - do { \ - if (!(condition)) { _sz_assert_failure(#condition, __FILE__, __LINE__); } \ - } while (0) -#else -#define sz_assert(condition) ((void)(condition)) -#endif - -/* Intrinsics aliases for MSVC, GCC, Clang, and Clang-Cl. - * The following section of compiler intrinsics comes in 2 flavors. - */ -#if defined(_MSC_VER) && !defined(__clang__) // On Clang-CL -#include - -// Sadly, when building Win32 images, we can't use the `_tzcnt_u64`, `_lzcnt_u64`, -// `_BitScanForward64`, or `_BitScanReverse64` intrinsics. For now it's a simple `for`-loop. -// TODO: In the future we can switch to a more efficient De Bruijn's algorithm. -// https://www.chessprogramming.org/BitScan -// https://www.chessprogramming.org/De_Bruijn_Sequence -// https://gist.github.com/resilar/e722d4600dbec9752771ab4c9d47044f -// -// Use the serial version on 32-bit x86 and on Arm. 
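The per-bit loops below are the simplest fallback, but they cost one iteration per skipped bit. Short of the De Bruijn table lookup referenced in the TODO above, a branch-halving variant is also portable; a hedged sketch, not part of the library:

// Counts trailing zeros of a non-zero 64-bit value in six range checks
// instead of up to 63 loop iterations; purely illustrative.
static int ctz64_halving(unsigned long long x) {
    int n = 0;
    if ((x & 0xFFFFFFFFull) == 0) { n += 32, x >>= 32; }
    if ((x & 0x0000FFFFull) == 0) { n += 16, x >>= 16; }
    if ((x & 0x000000FFull) == 0) { n += 8, x >>= 8; }
    if ((x & 0x0000000Full) == 0) { n += 4, x >>= 4; }
    if ((x & 0x00000003ull) == 0) { n += 2, x >>= 2; }
    return n + (int)(~x & 1ull); // add one more if the lowest remaining bit is still zero
}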
-#if (defined(_WIN32) && !defined(_WIN64)) || defined(_M_ARM) || defined(_M_ARM64) -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 1) == 0) { n++, x >>= 1; } - return n; -} -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 0x8000000000000000ull) == 0) { n++, x <<= 1; } - return n; -} -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { - x = x - ((x >> 1) & 0x5555555555555555ull); - x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull); - return (((x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Full) * 0x0101010101010101ull) >> 56; -} -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 1) == 0) { n++, x >>= 1; } - return n; -} -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 0x80000000u) == 0) { n++, x <<= 1; } - return n; -} -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { - x = x - ((x >> 1) & 0x55555555); - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - return (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; -} -#else -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return (int)_tzcnt_u64(x); } -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return (int)_lzcnt_u64(x); } -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return (int)__popcnt64(x); } -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return (int)_tzcnt_u32(x); } -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return (int)_lzcnt_u32(x); } -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return (int)__popcnt(x); } -#endif -// Force the byteswap functions to be intrinsics, because when /Oi- is given, these will turn into CRT function calls, -// which breaks when `SZ_AVOID_LIBC` is given -#pragma intrinsic(_byteswap_uint64) -SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return _byteswap_uint64(val); } -#pragma intrinsic(_byteswap_ulong) -SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return _byteswap_ulong(val); } -#else -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return __builtin_popcountll(x); } -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return __builtin_popcount(x); } -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return __builtin_ctzll(x); } -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return __builtin_clzll(x); } -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return __builtin_ctz(x); } // ! Undefined if `x == 0` -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return __builtin_clz(x); } // ! Undefined if `x == 0` -SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return __builtin_bswap64(val); } -SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return __builtin_bswap32(val); } -#endif - -SZ_INTERNAL sz_u64_t sz_u64_rotl(sz_u64_t x, sz_u64_t r) { return (x << r) | (x >> (64 - r)); } - -/** - * @brief Select bits from either ::a or ::b depending on the value of ::mask bits. - * - * Similar to `_mm_blend_epi16` intrinsic on x86. - * Described in the "Bit Twiddling Hacks" by Sean Eron Anderson. - * https://graphics.stanford.edu/~seander/bithacks.html#ConditionalSetOrClearBitsWithoutBranching - */ -SZ_INTERNAL sz_u64_t sz_u64_blend(sz_u64_t a, sz_u64_t b, sz_u64_t mask) { return a ^ ((a ^ b) & mask); } - -/* - * Efficiently computing the minimum and maximum of two or three values can be tricky. - * The simple branching baseline would be: - * - * x < y ? x : y // can replace with 1 conditional move - * - * Branchless approach is well known for signed integers, but it doesn't apply to unsigned ones. 
- * https://stackoverflow.com/questions/514435/templatized-branchless-int-max-min-function - * https://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax - * Using only bit-shifts for singed integers it would be: - * - * y + ((x - y) & (x - y) >> 31) // 4 unique operations - * - * Alternatively, for any integers using multiplication: - * - * (x > y) * y + (x <= y) * x // 5 operations - * - * Alternatively, to avoid multiplication: - * - * x & ~((x < y) - 1) + y & ((x < y) - 1) // 6 unique operations - */ -#define sz_min_of_two(x, y) (x < y ? x : y) -#define sz_max_of_two(x, y) (x < y ? y : x) -#define sz_min_of_three(x, y, z) sz_min_of_two(x, sz_min_of_two(y, z)) -#define sz_max_of_three(x, y, z) sz_max_of_two(x, sz_max_of_two(y, z)) - -/** @brief Branchless minimum function for two signed 32-bit integers. */ -SZ_INTERNAL sz_i32_t sz_i32_min_of_two(sz_i32_t x, sz_i32_t y) { return y + ((x - y) & (x - y) >> 31); } - -/** @brief Branchless minimum function for two signed 32-bit integers. */ -SZ_INTERNAL sz_i32_t sz_i32_max_of_two(sz_i32_t x, sz_i32_t y) { return x - ((x - y) & (x - y) >> 31); } - -/** - * @brief Clamps signed offsets in a string to a valid range. Used for Pythonic-style slicing. - */ -SZ_INTERNAL void sz_ssize_clamp_interval(sz_size_t length, sz_ssize_t start, sz_ssize_t end, - sz_size_t *normalized_offset, sz_size_t *normalized_length) { - // TODO: Remove branches. - // Normalize negative indices - if (start < 0) start += length; - if (end < 0) end += length; - - // Clamp indices to a valid range - if (start < 0) start = 0; - if (end < 0) end = 0; - if (start > (sz_ssize_t)length) start = length; - if (end > (sz_ssize_t)length) end = length; - - // Ensure start <= end - if (start > end) start = end; - - *normalized_offset = start; - *normalized_length = end - start; -} - -/** - * @brief Compute the logarithm base 2 of a positive integer, rounding down. - */ -SZ_INTERNAL sz_size_t sz_size_log2i_nonzero(sz_size_t x) { - sz_assert(x > 0 && "Non-positive numbers have no defined logarithm"); - sz_size_t leading_zeros = sz_u64_clz(x); - return 63 - leading_zeros; -} - -/** - * @brief Compute the smallest power of two greater than or equal to ::x. - */ -SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) { - // Unlike the commonly used trick with `clz` intrinsics, is valid across the whole range of `x`. - // https://stackoverflow.com/a/10143264 - x--; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; -#if SZ_DETECT_64_BIT - x |= x >> 32; -#endif - x++; - return x; -} - -/** - * @brief Transposes an 8x8 bit matrix packed in a `sz_u64_t`. - * - * There is a well known SWAR sequence for that known to chess programmers, - * willing to flip a bit-matrix of pieces along the main A1-H8 diagonal. - * https://www.chessprogramming.org/Flipping_Mirroring_and_Rotating - * https://lukas-prokop.at/articles/2021-07-23-transpose - */ -SZ_INTERNAL sz_u64_t sz_u64_transpose(sz_u64_t x) { - sz_u64_t t; - t = x ^ (x << 36); - x ^= 0xf0f0f0f00f0f0f0full & (t ^ (x >> 36)); - t = 0xcccc0000cccc0000ull & (x ^ (x << 18)); - x ^= t ^ (t >> 18); - t = 0xaa00aa00aa00aa00ull & (x ^ (x << 9)); - x ^= t ^ (t >> 9); - return x; -} - -/** - * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. - */ -SZ_INTERNAL void sz_u64_swap(sz_u64_t *a, sz_u64_t *b) { - sz_u64_t t = *a; - *a = *b; - *b = t; -} - -/** - * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. 
- */ -SZ_INTERNAL void sz_pointer_swap(void **a, void **b) { - void *t = *a; - *a = *b; - *b = t; -} - -/** - * @brief Helper structure to simplify work with 16-bit words. - * @see sz_u16_load - */ -typedef union sz_u16_vec_t { - sz_u16_t u16; - sz_u8_t u8s[2]; -} sz_u16_vec_t; - -/** - * @brief Load a 16-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u16_vec_t sz_u16_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u16_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u16_vec_t *)ptr); -#else - return *((__unaligned sz_u16_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u16_vec_t const *result = (sz_u16_vec_t const *)ptr; - return *result; -#endif -} - -/** - * @brief Helper structure to simplify work with 32-bit words. - * @see sz_u32_load - */ -typedef union sz_u32_vec_t { - sz_u32_t u32; - sz_u16_t u16s[2]; - sz_u8_t u8s[4]; -} sz_u32_vec_t; - -/** - * @brief Load a 32-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u32_vec_t sz_u32_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u32_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - result.u8s[2] = ptr[2]; - result.u8s[3] = ptr[3]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u32_vec_t *)ptr); -#else - return *((__unaligned sz_u32_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u32_vec_t const *result = (sz_u32_vec_t const *)ptr; - return *result; -#endif -} - -/** - * @brief Helper structure to simplify work with 64-bit words. - * @see sz_u64_load - */ -typedef union sz_u64_vec_t { - sz_u64_t u64; - sz_u32_t u32s[2]; - sz_u16_t u16s[4]; - sz_u8_t u8s[8]; -} sz_u64_vec_t; - -/** - * @brief Load a 64-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u64_vec_t sz_u64_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u64_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - result.u8s[2] = ptr[2]; - result.u8s[3] = ptr[3]; - result.u8s[4] = ptr[4]; - result.u8s[5] = ptr[5]; - result.u8s[6] = ptr[6]; - result.u8s[7] = ptr[7]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u64_vec_t *)ptr); -#else - return *((__unaligned sz_u64_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u64_vec_t const *result = (sz_u64_vec_t const *)ptr; - return *result; -#endif -} - -/** @brief Helper function, using the supplied fixed-capacity buffer to allocate memory. */ -SZ_INTERNAL sz_ptr_t _sz_memory_allocate_fixed(sz_size_t length, void *handle) { - sz_size_t capacity; - sz_copy((sz_ptr_t)&capacity, (sz_cptr_t)handle, sizeof(sz_size_t)); - sz_size_t consumed_capacity = sizeof(sz_size_t); - if (consumed_capacity + length > capacity) return SZ_NULL_CHAR; - return (sz_ptr_t)handle + consumed_capacity; -} - -/** @brief Helper "no-op" function, simulating memory deallocation when we use a "static" memory buffer. 
*/ -SZ_INTERNAL void _sz_memory_free_fixed(sz_ptr_t start, sz_size_t length, void *handle) { - sz_unused(start && length && handle); -} - -/** @brief An internal callback used to set a bit in a power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) & (fingerprint_bytes - 1)] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback used to set a bit in a @b non power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_non_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) % fingerprint_bytes] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback, used to mix all the running hashes into one pointer-size value. */ -SZ_INTERNAL void _sz_hashes_fingerprint_scalar_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *scalar_handle) { - sz_unused(start && length && hash && scalar_handle); - sz_size_t *scalar_ptr = (sz_size_t *)scalar_handle; - *scalar_ptr ^= hash; -} - -/** - * @brief Chooses the offsets of the most interesting characters in a search needle. - * - * Search throughput can significantly deteriorate if we are matching the wrong characters. - * Say the needle is "aXaYa", and we are comparing the first, second, and last character. - * If we use SIMD and compare many offsets at a time, comparing against "a" in every register is a waste. - * - * Similarly, dealing with UTF8 inputs, we know that the lower bits of each character code carry more information. - * Cyrillic alphabet, for example, falls into [0x0410, 0x042F] code range for uppercase [А, Я], and - * into [0x0430, 0x044F] for lowercase [а, я]. Scanning through a text written in Russian, half of the - * bytes will carry absolutely no value and will be equal to 0x04. - */ -SZ_INTERNAL void _sz_locate_needle_anomalies(sz_cptr_t start, sz_size_t length, // - sz_size_t *first, sz_size_t *second, sz_size_t *third) { - *first = 0; - *second = length / 2; - *third = length - 1; - - // - int has_duplicates = // - start[*first] == start[*second] || // - start[*first] == start[*third] || // - start[*second] == start[*third]; - - // Loop through letters to find non-colliding variants. - if (length > 3 && has_duplicates) { - // Pivot the middle point right, until we find a character different from the first one. - for (; start[*second] == start[*first] && *second + 1 < *third; ++(*second)) {} - // Pivot the third (last) point left, until we find a different character. - for (; (start[*third] == start[*second] || start[*third] == start[*first]) && *third > (*second + 1); - --(*third)) {} - } - - // TODO: Investigate alternative strategies for long needles. - // On very long needles we have the luxury to choose! 
- // Often dealing with UTF8, we will likely benefit from shifting the first and second characters - // further to the right, to achieve not only uniqueness within the needle, but also avoid common - // rune prefixes of 2-, 3-, and 4-byte codes. - if (length > 8) { - // Pivot the first and second points right, until we find a character, that: - // > is different from others. - // > doesn't start with 0b'110x'xxxx - only 5 bits of relevant info. - // > doesn't start with 0b'1110'xxxx - only 4 bits of relevant info. - // > doesn't start with 0b'1111'0xxx - only 3 bits of relevant info. - // - // So we are practically searching for byte values that start with 0b0xxx'xxxx or 0b'10xx'xxxx. - // Meaning they fall in the range [0, 127] and [128, 191], in other words any unsigned int up to 191. - sz_u8_t const *start_u8 = (sz_u8_t const *)start; - sz_size_t vibrant_first = *first, vibrant_second = *second, vibrant_third = *third; - - // Let's begin with the seccond character, as the termination criteria there is more obvious - // and we may end up with more variants to check for the first candidate. - for (; (start_u8[vibrant_second] > 191 || start_u8[vibrant_second] == start_u8[vibrant_third]) && - (vibrant_second + 1 < vibrant_third); - ++vibrant_second) {} - - // Now check if we've indeed found a good candidate or should revert the `vibrant_second` to `second`. - if (start_u8[vibrant_second] < 191) { *second = vibrant_second; } - else { vibrant_second = *second; } - - // Now check the first character. - for (; (start_u8[vibrant_first] > 191 || start_u8[vibrant_first] == start_u8[vibrant_second] || - start_u8[vibrant_first] == start_u8[vibrant_third]) && - (vibrant_first + 1 < vibrant_second); - ++vibrant_first) {} - - // Now check if we've indeed found a good candidate or should revert the `vibrant_first` to `first`. - // We don't need to shift the third one when dealing with texts as the last byte of the text is - // also the last byte of a rune and contains the most information. - if (start_u8[vibrant_first] < 191) { *first = vibrant_first; } - } -} - -#pragma GCC visibility pop -#pragma endregion - -#pragma region Serial Implementation - -#if !SZ_AVOID_LIBC -#include // `fprintf` -#include // `malloc`, `EXIT_FAILURE` - -SZ_PUBLIC void *_sz_memory_allocate_default(sz_size_t length, void *handle) { - sz_unused(handle); - return malloc(length); -} -SZ_PUBLIC void _sz_memory_free_default(sz_ptr_t start, sz_size_t length, void *handle) { - sz_unused(handle && length); - free(start); -} - -#endif - -SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc) { -#if !SZ_AVOID_LIBC - alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_default; - alloc->free = (sz_memory_free_t)_sz_memory_free_default; -#else - alloc->allocate = (sz_memory_allocate_t)SZ_NULL; - alloc->free = (sz_memory_free_t)SZ_NULL; -#endif - alloc->handle = SZ_NULL; -} - -SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length) { - // The logic here is simple - put the buffer length in the first slots of the buffer. - // Later use it for bounds checking. - alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_fixed; - alloc->free = (sz_memory_free_t)_sz_memory_free_fixed; - alloc->handle = &buffer; - sz_copy((sz_ptr_t)buffer, (sz_cptr_t)&length, sizeof(sz_size_t)); -} - -/** - * @brief Byte-level equality comparison between two strings. - * If unaligned loads are allowed, uses a switch-table to avoid loops on short strings. 
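Given the fixed-buffer allocator initialized above, a caller can route the dynamically dispatched similarity functions through an on-stack arena. A hedged sketch: the arena size is arbitrary, and passing a zero bound is assumed to request the exact, unbounded distance.

#include <stringzilla/stringzilla.h>

static void demo_distance_on_stack(void) {
    char arena[4096]; // scratch space; any size large enough for a couple of matrix rows
    sz_memory_allocator_t alloc;
    sz_memory_allocator_init_fixed(&alloc, arena, sizeof(arena));
    // Assumption: a zero bound means "no upper bound" here.
    sz_size_t distance = sz_edit_distance("kitten", 6, "sitting", 7, 0, &alloc);
    (void)distance; // expected to be 3 for this pair, if the sketch's assumptions hold
}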
- */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_cptr_t const a_end = a + length; -#if SZ_USE_MISALIGNED_LOADS - if (length >= SZ_SWAR_THRESHOLD) { - sz_u64_vec_t a_vec, b_vec; - for (; a + 8 <= a_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) return sz_false_k; - } - } -#endif - while (a != a_end && *a == *b) a++, b++; - return (sz_bool_t)(a_end == a); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { - for (sz_cptr_t const end = text + length; text != end; ++text) - if (sz_charset_contains(set, *text)) return text; - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Warray-bounds" - sz_cptr_t const end = text; - for (text += length; text != end;) - if (sz_charset_contains(set, *(text -= 1))) return text; - return SZ_NULL_CHAR; -#pragma GCC diagnostic pop -} - -/** - * One option to avoid branching is to use conditional moves and lookup the comparison result in a table: - * sz_ordering_t ordering_lookup[2] = {sz_greater_k, sz_less_k}; - * for (; a != min_end; ++a, ++b) - * if (*a != *b) return ordering_lookup[*a < *b]; - * That, however, introduces a data-dependency. - * A cleaner option is to perform two comparisons and a subtraction. - * One instruction more, but no data-dependency. - */ -#define _sz_order_scalars(a, b) ((sz_ordering_t)((a > b) - (a < b))) - -SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - sz_bool_t a_shorter = (sz_bool_t)(a_length < b_length); - sz_size_t min_length = a_shorter ? a_length : b_length; - sz_cptr_t min_end = a + min_length; -#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN - for (sz_u64_vec_t a_vec, b_vec; a + 8 <= min_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) - return _sz_order_scalars(sz_u64_bytes_reverse(a_vec.u64), sz_u64_bytes_reverse(b_vec.u64)); - } -#endif - for (; a != min_end; ++a, ++b) - if (*a != *b) return _sz_order_scalars(*a, *b); - - // If the strings are equal up to `min_end`, then the shorter string is smaller - return _sz_order_scalars(a_length, b_length); -} - -/** - * @brief Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each byte is set. - // For that take the bottom 7 bits of each byte, add one to them, - // and if this sets the top bit to one, then all the 7 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) & ((vec.u64 & 0x8080808080808080ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memchr(haystack, needle[0], haystack_length)`. - */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_end = h + h_length; - -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. 
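    // For example, broadcasting the needle byte 'a' (0x61) gives 0x6161616161616161; against a
    // window holding "banana..", `~(h ^ n)` is 0xFF exactly in the matching bytes, the +1 carry
    // over the low 7 bits then raises the top bit only where all eight bits matched, and
    // `sz_u64_ctz(match) / 8` recovers the offset of the first 'a' inside that 8-byte window.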
-#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. - sz_u64_vec_t h_vec, n_vec, match_vec; - match_vec.u64 = 0; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h + 8 <= h_end; h += 8) { - h_vec.u64 = *(sz_u64_t const *)h; - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h + sz_u64_ctz(match_vec.u64) / 8; - } -#endif - - // Handle the misaligned tail. - for (; h < h_end; ++h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief Find the last occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memrchr(haystack, needle[0], haystack_length)`. - */ -sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_start = h; - - // Reposition the `h` pointer to the end, as we will be walking backwards. - h = h + h_length - 1; - -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. -#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)(h + 1) & 7ull) && h >= h_start; --h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. - sz_u64_vec_t h_vec, n_vec, match_vec; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h >= h_start + 7; h -= 8) { - h_vec.u64 = *(sz_u64_t const *)(h - 7); - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h - sz_u64_clz(match_vec.u64) / 8; - } -#endif - - for (; h >= h_start; --h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 2Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 2byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_2byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 2byte is set. - // For that take the bottom 15 bits of each 2byte, add one to them, - // and if this sets the top bit to one, then all the 15 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFF7FFF7FFF7FFFull) + 0x0001000100010001ull) & ((vec.u64 & 0x8000800080008000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b two-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_2byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 2 bytes long. - sz_assert(h_length >= 2 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. 
- for (; ((sz_size_t)h & 7ull) && h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; -#endif - - sz_u64_vec_t h_even_vec, h_odd_vec, n_vec, matches_even_vec, matches_odd_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1]; - n_vec.u64 *= 0x0001000100010001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. - for (; h + 9 <= h_end; h += 8) { - h_even_vec.u64 = *(sz_u64_t *)h; - h_odd_vec.u64 = (h_even_vec.u64 >> 8) | ((sz_u64_t)h[8] << 56); - matches_even_vec = _sz_u64_each_2byte_equal(h_even_vec, n_vec); - matches_odd_vec = _sz_u64_each_2byte_equal(h_odd_vec, n_vec); - - matches_even_vec.u64 >>= 8; - if (matches_even_vec.u64 + matches_odd_vec.u64) { - sz_u64_t match_indicators = matches_even_vec.u64 | matches_odd_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 4Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 4byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_4byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 4byte is set. - // For that take the bottom 31 bits of each 4byte, add one to them, - // and if this sets the top bit to one, then all the 31 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFFFFFF7FFFFFFFull) + 0x0000000100000001ull) & ((vec.u64 & 0x8000000080000000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b four-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_4byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 4 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 4 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; -#endif - - sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, n_vec, matches0_vec, matches1_vec, matches2_vec, matches3_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2], n_vec.u8s[3] = n[3]; - n_vec.u64 *= 0x0000000100000001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using four 64-bit words. - // We load the subsequent four-byte word as well, taking its first bytes. 
Think of it as a glorified prefetch :) - sz_u64_t h_page_current, h_page_next; - for (; h + sizeof(sz_u64_t) + sizeof(sz_u32_t) <= h_end; h += sizeof(sz_u64_t)) { - h_page_current = *(sz_u64_t *)h; - h_page_next = *(sz_u32_t *)(h + 8); - h0_vec.u64 = (h_page_current); - h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); - h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); - h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); - matches0_vec = _sz_u64_each_4byte_equal(h0_vec, n_vec); - matches1_vec = _sz_u64_each_4byte_equal(h1_vec, n_vec); - matches2_vec = _sz_u64_each_4byte_equal(h2_vec, n_vec); - matches3_vec = _sz_u64_each_4byte_equal(h3_vec, n_vec); - - if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64) { - matches0_vec.u64 >>= 24; - matches1_vec.u64 >>= 16; - matches2_vec.u64 >>= 8; - sz_u64_t match_indicators = matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 4 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 3Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 3byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_3byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 4byte is set. - // For that take the bottom 31 bits of each 4byte, add one to them, - // and if this sets the top bit to one, then all the 31 bits are ones as well. - vec.u64 = ((vec.u64 & 0xFFFF7FFFFF7FFFFFull) + 0x0000000001000001ull) & ((vec.u64 & 0x0000800000800000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b three-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_3byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 3 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 3 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h; -#endif - - // We fetch 12 - sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, h4_vec; - sz_u64_vec_t matches0_vec, matches1_vec, matches2_vec, matches3_vec, matches4_vec; - sz_u64_vec_t n_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2]; - n_vec.u64 *= 0x0000000001000001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using three 64-bit words. - // We load the subsequent two-byte word as well. 
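    // With the three-byte needle copied to byte offsets 0 and 3 of `n_vec`, the five views shifted
    // by 0..4 bytes test candidate matches starting at offsets {0,3}, {1,4}, {2,5}, {3,6}, and
    // {4,7}, which together cover every position inside the current 8-byte window before the loop
    // advances by eight bytes.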
- sz_u64_t h_page_current, h_page_next; - for (; h + sizeof(sz_u64_t) + sizeof(sz_u16_t) <= h_end; h += sizeof(sz_u64_t)) { - h_page_current = *(sz_u64_t *)h; - h_page_next = *(sz_u16_t *)(h + 8); - h0_vec.u64 = (h_page_current); - h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); - h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); - h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); - h4_vec.u64 = (h_page_current >> 32) | (h_page_next << 32); - matches0_vec = _sz_u64_each_3byte_equal(h0_vec, n_vec); - matches1_vec = _sz_u64_each_3byte_equal(h1_vec, n_vec); - matches2_vec = _sz_u64_each_3byte_equal(h2_vec, n_vec); - matches3_vec = _sz_u64_each_3byte_equal(h3_vec, n_vec); - matches4_vec = _sz_u64_each_3byte_equal(h4_vec, n_vec); - - if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64) { - matches0_vec.u64 >>= 16; - matches1_vec.u64 >>= 8; - matches3_vec.u64 <<= 8; - matches4_vec.u64 <<= 16; - sz_u64_t match_indicators = - matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 3 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_find_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - // Several popular string matching algorithms are using a bad-character shift table. - // Boyer Moore: https://www-igm.univ-mlv.fr/~lecroq/string/node14.html - // Quick Search: https://www-igm.univ-mlv.fr/~lecroq/string/node19.html - // Smith: https://www-igm.univ-mlv.fr/~lecroq/string/node21.html - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) bad_shift_table.jumps[n[i]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the last `n_length - 1` bytes. 
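    // For a needle like "abcd" the table maps 'a' to 3, 'b' to 2, 'c' to 1, and every other byte,
    // including the final 'd', to the full needle length of 4; each unsuccessful alignment then
    // advances by the jump stored for the haystack byte sitting under the needle's last position.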
- for (sz_size_t i = 0; i <= h_length - n_length;) { - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - i += bad_shift_table.jumps[h[i + n_length - 1]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for @b reverse-order exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) - bad_shift_table.jumps[n[n_length - i - 1]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the first `n_length - 1` bytes. - for (sz_size_t j = 0; j <= h_length - n_length;) { - sz_size_t i = h_length - n_length - j; - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - j += bad_shift_table.jumps[h[i]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Exact substring search helper function, that finds the first occurrence of a prefix of the needle - * using a given search function, and then verifies the remaining part of the needle. - */ -SZ_INTERNAL sz_cptr_t _sz_find_with_prefix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_prefix, sz_size_t prefix_length) { - - sz_size_t suffix_length = n_length - prefix_length; - while (1) { - sz_cptr_t found = find_prefix(h, h_length, n, prefix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = h_length - (found - h); - if (remaining < n_length) return SZ_NULL_CHAR; - if (sz_equal(found + prefix_length, n + prefix_length, suffix_length)) return found; - - // Adjust the position. 
- h = found + 1; - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -/** - * @brief Exact reverse-order substring search helper function, that finds the last occurrence of a suffix of the - * needle using a given search function, and then verifies the remaining part of the needle. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_with_suffix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_suffix, sz_size_t suffix_length) { - - sz_size_t prefix_length = n_length - suffix_length; - while (1) { - sz_cptr_t found = find_suffix(h, h_length, n + prefix_length, suffix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = found - h; - if (remaining < prefix_length) return SZ_NULL_CHAR; - if (sz_equal(found - prefix_length, n, prefix_length)) return found - prefix_length; - - // Adjust the position. - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -SZ_INTERNAL sz_cptr_t _sz_find_over_4bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, (sz_find_t)_sz_find_4byte_serial, 4); -} - -SZ_INTERNAL sz_cptr_t _sz_find_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, - sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, _sz_find_horspool_upto_256bytes_serial, 256); -} - -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, - sz_size_t n_length) { - return _sz_rfind_with_suffix(h, h_length, n, n_length, _sz_rfind_horspool_upto_256bytes_serial, 256); -} - -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - -#if SZ_DETECT_BIG_ENDIAN - sz_find_t backends[] = { - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[(n_length > 1) + (n_length > 256)](h, h_length, n, n_length); -#else - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_2byte_serial, - (sz_find_t)_sz_find_3byte_serial, - (sz_find_t)_sz_find_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - (sz_find_t)_sz_find_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. - (n_length > 1) + (n_length > 2) + (n_length > 3) + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 4) + - // For longer needles - use skip tables. - (n_length > 8) + (n_length > 256)](h, h_length, n, n_length); -#endif -} - -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. 
- (sz_find_t)sz_rfind_byte_serial, - // TODO: implement reverse-order SWAR for 2/3/4 byte variants. - // TODO: (sz_find_t)_sz_rfind_2byte_serial, - // TODO: (sz_find_t)_sz_rfind_3byte_serial, - // TODO: (sz_find_t)_sz_rfind_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - // (sz_find_t)_sz_rfind_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_rfind_horspool_upto_256bytes_serial, - (sz_find_t)_sz_rfind_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. - 0 + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 1) + - // For longer needles - use skip tables. - (n_length > 256)](h, h_length, n, n_length); -} - -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // TODO: Generalize to remove the following asserts! - sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix."); - sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet."); - sz_unused(longer_length && bound); - - // We are going to store 3 diagonals of the matrix. - // The length of the longest (main) diagonal would be `n = (shorter_length + 1)`. - sz_size_t n = shorter_length + 1; - sz_size_t buffer_length = sizeof(sz_size_t) * n * 3; - sz_size_t *distances = (sz_size_t *)alloc->allocate(buffer_length, alloc->handle); - if (!distances) return SZ_SIZE_MAX; - - sz_size_t *previous_distances = distances; - sz_size_t *current_distances = previous_distances + n; - sz_size_t *next_distances = previous_distances + n * 2; - - // Initialize the first two diagonals: - previous_distances[0] = 0; - current_distances[0] = current_distances[1] = 1; - - // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_diagonal_index = 2; - for (; next_diagonal_index != n; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = next_diagonal_index + 1; - for (sz_size_t i = 0; i + 2 < next_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[next_diagonal_index - i - 2] != longer[i]; - sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; - sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; - next_distances[i + 1] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); - } - // Don't forget to populate the first row and the first column of the Levenshtein matrix. - next_distances[0] = next_distances[next_diagonal_length - 1] = next_diagonal_index; - // Perform a circular rotation of those buffers, to reuse the memory. - sz_size_t *temporary = previous_distances; - previous_distances = current_distances; - current_distances = next_distances; - next_distances = temporary; - } - - // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a - // larger diagonal. From now onwards, we will be shrinking. 
Instead of adding value equal to the skewed diagonal - // index on either side, we will be cropping those values out. - sz_size_t diagonals_count = n + n - 1; - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; - for (sz_size_t i = 0; i != next_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[shorter_length - 1 - i] != longer[next_diagonal_index - n + i]; - sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; - sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; - next_distances[i] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); - } - // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift, - // dropping the first element in the current array. - sz_size_t *temporary = previous_distances; - previous_distances = current_distances + 1; - current_distances = next_distances; - next_distances = temporary; - } - - // Cache scalar before `free` call. - sz_size_t result = current_distances[0]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -/** - * @brief Describes the length of a UTF8 character / codepoint / rune in bytes. - */ -typedef enum { - sz_utf8_invalid_k = 0, //!< Invalid UTF8 character. - sz_utf8_rune_1byte_k = 1, //!< 1-byte UTF8 character. - sz_utf8_rune_2bytes_k = 2, //!< 2-byte UTF8 character. - sz_utf8_rune_3bytes_k = 3, //!< 3-byte UTF8 character. - sz_utf8_rune_4bytes_k = 4, //!< 4-byte UTF8 character. -} sz_rune_length_t; - -typedef sz_u32_t sz_rune_t; - -/** - * @brief Extracts just one UTF8 codepoint from a UTF8 string into a 32-bit unsigned integer. - */ -SZ_INTERNAL void _sz_extract_utf8_rune(sz_cptr_t utf8, sz_rune_t *code, sz_rune_length_t *code_length) { - sz_u8_t const *current = (sz_u8_t const *)utf8; - sz_u8_t leading_byte = *current++; - sz_rune_t ch; - sz_rune_length_t ch_length; - - // TODO: This can be made entirely branchless using 32-bit SWAR. - if (leading_byte < 0x80) { - // Single-byte rune (0xxxxxxx) - ch = leading_byte; - ch_length = sz_utf8_rune_1byte_k; - } - else if ((leading_byte & 0xE0) == 0xC0) { - // Two-byte rune (110xxxxx 10xxxxxx) - ch = (leading_byte & 0x1F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_2bytes_k; - } - else if ((leading_byte & 0xF0) == 0xE0) { - // Three-byte rune (1110xxxx 10xxxxxx 10xxxxxx) - ch = (leading_byte & 0x0F) << 12; - ch |= (*current++ & 0x3F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_3bytes_k; - } - else if ((leading_byte & 0xF8) == 0xF0) { - // Four-byte rune (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) - ch = (leading_byte & 0x07) << 18; - ch |= (*current++ & 0x3F) << 12; - ch |= (*current++ & 0x3F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_4bytes_k; - } - else { - // Invalid UTF8 rune. - ch = 0; - ch_length = sz_utf8_invalid_k; - } - *code = ch; - *code_length = ch_length; -} - -/** - * @brief Exports a UTF8 string into a UTF32 buffer. - * ! The result is undefined id the UTF8 string is corrupted. - * @return The length in the number of codepoints. 
- */ -SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_length, sz_rune_t *utf32) { - sz_cptr_t const end = utf8 + utf8_length; - sz_size_t count = 0; - sz_rune_length_t rune_length; - for (; utf8 != end; utf8 += rune_length, utf32++, count++) _sz_extract_utf8_rune(utf8, utf32, &rune_length); - return count; -} - -/** - * @brief Compute the Levenshtein distance between two strings using the Wagner-Fisher algorithm. - * Stores only 2 rows of the Levenshtein matrix, but uses 64-bit integers for the distance values, - * and upcasts UTF8 variable-length codepoints to 64-bit integers for faster addressing. - * - * ! In the worst case for 2 strings of length 100, that contain just one 16-bit codepoint this will result in extra: - * + 2 rows * 100 slots * 8 bytes/slot = 1600 bytes of memory for the two rows of the Levenshtein matrix rows. - * + 100 codepoints * 2 strings * 4 bytes/codepoint = 800 bytes of memory for the UTF8 buffer. - * = 2400 bytes of memory or @b 12x memory amplification! - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_bool_t can_be_unicode, sz_memory_allocator_t *alloc) { - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // A good idea may be to dispatch different kernels for different string lengths. - // Like using `uint8_t` counters for strings under 255 characters long. - // Good in theory, this results in frequent upcasts and downcasts in serial code. - // On strings over 20 bytes, using `uint8` over `uint64` on 64-bit x86 CPU doubles the execution time. - // So one must be very cautious with such optimizations. - typedef sz_size_t _distance_t; - - // Compute the number of columns in our Levenshtein matrix. - sz_size_t const n = shorter_length + 1; - - // If a buffering memory-allocator is provided, this operation is practically free, - // and cheaper than allocating even 512 bytes (for small distance matrices) on stack. - sz_size_t buffer_length = sizeof(_distance_t) * (n * 2); - - // If the strings contain Unicode characters, let's estimate the max character width, - // and use it to allocate a larger buffer to decode UTF8. - if ((can_be_unicode == sz_true_k) && - (sz_isascii(longer, longer_length) == sz_false_k || sz_isascii(shorter, shorter_length) == sz_false_k)) { - buffer_length += (shorter_length + longer_length) * sizeof(sz_rune_t); - } - else { can_be_unicode = sz_false_k; } - - // If the allocation fails, return the maximum distance. - sz_ptr_t const buffer = (sz_ptr_t)alloc->allocate(buffer_length, alloc->handle); - if (!buffer) return SZ_SIZE_MAX; - - // Let's export the UTF8 sequence into the newly allocated buffer at the end. - if (can_be_unicode == sz_true_k) { - sz_rune_t *const longer_utf32 = (sz_rune_t *)(buffer + sizeof(_distance_t) * (n * 2)); - sz_rune_t *const shorter_utf32 = longer_utf32 + longer_length; - // Export the UTF8 sequences into the newly allocated buffer. 
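
As a worked illustration of the two helpers above, here is a minimal sketch decoding two common codepoints; the local names are placeholders, not part of the patch:

    sz_rune_t rune;
    sz_rune_length_t rune_length;
    _sz_extract_utf8_rune("\xC3\xA9", &rune, &rune_length);     // U+00E9: ((0xC3 & 0x1F) << 6) | (0xA9 & 0x3F), length 2
    _sz_extract_utf8_rune("\xE2\x82\xAC", &rune, &rune_length); // U+20AC: a three-byte rune, length 3
    sz_rune_t utf32[2];
    sz_size_t count = _sz_export_utf8_to_utf32("\xC3\xA9\xE2\x82\xAC", 5, utf32); // 5 bytes -> 2 codepoints
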
- longer_length = _sz_export_utf8_to_utf32(longer, longer_length, longer_utf32); - shorter_length = _sz_export_utf8_to_utf32(shorter, shorter_length, shorter_utf32); - longer = (sz_cptr_t)longer_utf32; - shorter = (sz_cptr_t)shorter_utf32; - } - - // Let's parameterize the core logic for different character types and distance types. -#define _wagner_fisher_unbounded(_distance_t, _char_t) \ - /* Now let's cast our pointer to avoid it in subsequent sections. */ \ - _char_t const *const longer_chars = (_char_t const *)longer; \ - _char_t const *const shorter_chars = (_char_t const *)shorter; \ - _distance_t *previous_distances = (_distance_t *)buffer; \ - _distance_t *current_distances = previous_distances + n; \ - /* Initialize the first row of the Levenshtein matrix with `iota`-style arithmetic progression. */ \ - for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ - /* The main loop of the algorithm with quadratic complexity. */ \ - for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ - _char_t const longer_char = longer_chars[idx_longer]; \ - /* Using pure pointer arithmetic is faster than iterating with an index. */ \ - _char_t const *shorter_ptr = shorter_chars; \ - _distance_t const *previous_ptr = previous_distances; \ - _distance_t *current_ptr = current_distances; \ - _distance_t *const current_end = current_ptr + shorter_length; \ - current_ptr[0] = idx_longer + 1; \ - for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ - _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ - /* We can avoid `+1` for costs here, shifting it to post-minimum computation, */ \ - /* saving one increment operation. */ \ - _distance_t cost_deletion = previous_ptr[1]; \ - _distance_t cost_insertion = current_ptr[0]; \ - /* ? It might be a good idea to enforce branchless execution here. */ \ - /* ? The caveat being that the benchmarks on longer sequences backfire and more research is needed. */ \ - current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ - } \ - /* Swap `previous_distances` and `current_distances` pointers. */ \ - _distance_t *temporary = previous_distances; \ - previous_distances = current_distances; \ - current_distances = temporary; \ - } \ - /* Cache scalar before `free` call. */ \ - sz_size_t result = previous_distances[shorter_length]; \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return result; - - // Let's define a separate variant for bounded distance computation. - // Practically the same as unbounded, but also collecting the running minimum within each row for early exit. 
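
Before the bounded variant defined next, here is a stand-alone illustration of the same two-row recurrence for plain byte strings: a minimal sketch assuming inputs of at most 64 characters, with a hypothetical name, not the macro itself:

    static sz_size_t illustrative_levenshtein(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) {
        sz_size_t previous[64 + 1], current[64 + 1]; // the real kernel allocates its rows dynamically
        for (sz_size_t j = 0; j <= b_length; ++j) previous[j] = j;
        for (sz_size_t i = 0; i != a_length; ++i) {
            current[0] = i + 1;
            for (sz_size_t j = 0; j != b_length; ++j) {
                sz_size_t if_substitution = previous[j] + (a[i] != b[j]);
                sz_size_t if_deletion_or_insertion = sz_min_of_two(previous[j + 1], current[j]) + 1;
                current[j + 1] = sz_min_of_two(if_substitution, if_deletion_or_insertion);
            }
            for (sz_size_t j = 0; j <= b_length; ++j) previous[j] = current[j]; // swap by copying, for brevity
        }
        return previous[b_length];
    }
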
-#define _wagner_fisher_bounded(_distance_t, _char_t) \ - _char_t const *const longer_chars = (_char_t const *)longer; \ - _char_t const *const shorter_chars = (_char_t const *)shorter; \ - _distance_t *previous_distances = (_distance_t *)buffer; \ - _distance_t *current_distances = previous_distances + n; \ - for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ - for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ - _char_t const longer_char = longer_chars[idx_longer]; \ - _char_t const *shorter_ptr = shorter_chars; \ - _distance_t const *previous_ptr = previous_distances; \ - _distance_t *current_ptr = current_distances; \ - _distance_t *const current_end = current_ptr + shorter_length; \ - current_ptr[0] = idx_longer + 1; \ - /* Initialize min_distance with a value greater than bound */ \ - _distance_t min_distance = bound - 1; \ - for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ - _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ - _distance_t cost_deletion = previous_ptr[1]; \ - _distance_t cost_insertion = current_ptr[0]; \ - current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ - /* Keep track of the minimum distance seen so far in this row */ \ - min_distance = sz_min_of_two(current_ptr[1], min_distance); \ - } \ - /* If the minimum distance in this row exceeded the bound, return early */ \ - if (min_distance >= bound) { \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return bound; \ - } \ - _distance_t *temporary = previous_distances; \ - previous_distances = current_distances; \ - current_distances = temporary; \ - } \ - sz_size_t result = previous_distances[shorter_length]; \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return sz_min_of_two(result, bound); - - // Dispatch the actual computation. - if (!bound) { - if (can_be_unicode == sz_true_k) { _wagner_fisher_unbounded(sz_size_t, sz_rune_t); } - else { _wagner_fisher_unbounded(sz_size_t, sz_u8_t); } - } - else { - if (can_be_unicode == sz_true_k) { _wagner_fisher_bounded(sz_size_t, sz_rune_t); } - else { _wagner_fisher_bounded(sz_size_t, sz_u8_t); } - } -} - -SZ_PUBLIC sz_size_t sz_edit_distance_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Skip the matching prefixes and suffixes, they won't affect the distance. - for (sz_cptr_t a_end = longer + longer_length, b_end = shorter + shorter_length; - longer != a_end && shorter != b_end && *longer == *shorter; - ++longer, ++shorter, --longer_length, --shorter_length); - for (; longer_length && shorter_length && longer[longer_length - 1] == shorter[shorter_length - 1]; - --longer_length, --shorter_length); - - // Bounded computations may exit early. - int const is_bounded = bound < longer_length; - if (is_bounded) { - // If one of the strings is empty - the edit distance is equal to the length of the other one. 
- if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); - // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; - } - - if (shorter_length == 0) return longer_length; // If no mismatches were found - the distance is zero. - if (shorter_length == longer_length && !is_bounded) - return _sz_edit_distance_skewed_diagonals_serial(longer, longer_length, shorter, shorter_length, bound, alloc); - return _sz_edit_distance_wagner_fisher_serial(longer, longer_length, shorter, shorter_length, bound, sz_false_k, - alloc); -} - -SZ_PUBLIC sz_ssize_t sz_alignment_score_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc) { - - // If one of the strings is empty - the edit distance is equal to the length of the other one - if (longer_length == 0) return (sz_ssize_t)shorter_length * gap; - if (shorter_length == 0) return (sz_ssize_t)longer_length * gap; - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - sz_size_t n = shorter_length + 1; - sz_size_t buffer_length = sizeof(sz_ssize_t) * n * 2; - sz_ssize_t *distances = (sz_ssize_t *)alloc->allocate(buffer_length, alloc->handle); - sz_ssize_t *previous_distances = distances; - sz_ssize_t *current_distances = previous_distances + n; - - for (sz_size_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) - previous_distances[idx_shorter] = (sz_ssize_t)idx_shorter * gap; - - sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter; - sz_u8_t const *longer_unsigned = (sz_u8_t const *)longer; - for (sz_size_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { - current_distances[0] = ((sz_ssize_t)idx_longer + 1) * gap; - - // Initialize min_distance with a value greater than bound - sz_error_cost_t const *a_subs = subs + longer_unsigned[idx_longer] * 256ul; - for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) { - sz_ssize_t cost_deletion = previous_distances[idx_shorter + 1] + gap; - sz_ssize_t cost_insertion = current_distances[idx_shorter] + gap; - sz_ssize_t cost_substitution = previous_distances[idx_shorter] + a_subs[shorter_unsigned[idx_shorter]]; - current_distances[idx_shorter + 1] = sz_max_of_three(cost_deletion, cost_insertion, cost_substitution); - } - - // Swap previous_distances and current_distances pointers - sz_pointer_swap((void **)&previous_distances, (void **)¤t_distances); - } - - // Cache scalar before `free` call. 
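
A hypothetical usage sketch tying the two serial kernels together: `illustrative_costs` and `illustrate_scores` are made-up names, the cost layout simply mirrors the `subs + character * 256` indexing above, and `NULL` falls back to the default allocator:

    static sz_error_cost_t illustrative_costs[256 * 256];
    static void illustrate_scores(void) {
        // 0 for a match, -1 for a mismatch or a gap: the best score is then the negated Levenshtein distance.
        for (sz_size_t i = 0; i != 256; ++i)
            for (sz_size_t j = 0; j != 256; ++j) illustrative_costs[i * 256 + j] = i == j ? 0 : -1;
        sz_size_t distance = sz_edit_distance_serial("kitten", 6, "sitting", 7, 0, NULL); // == 3
        sz_ssize_t score = sz_alignment_score_serial("kitten", 6, "sitting", 7, illustrative_costs, -1, NULL);
        sz_assert(score == -(sz_ssize_t)distance); // the same 3 edits: k->s, e->i, and one insertion
        sz_unused(distance && score);
    }
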
- sz_ssize_t result = previous_distances[shorter_length]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - - sz_size_t const min_length = sz_min_of_two(a_length, b_length); - sz_size_t const max_length = sz_max_of_two(a_length, b_length); - sz_cptr_t const a_end = a + min_length; - bound = bound == 0 ? max_length : bound; - - // Walk through both strings using SWAR and counting the number of differing characters. - sz_size_t distance = max_length - min_length; -#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN - if (min_length >= SZ_SWAR_THRESHOLD) { - sz_u64_vec_t a_vec, b_vec, match_vec; - for (; a + 8 <= a_end && distance < bound; a += 8, b += 8) { - a_vec.u64 = sz_u64_load(a).u64; - b_vec.u64 = sz_u64_load(b).u64; - match_vec = _sz_u64_each_byte_equal(a_vec, b_vec); - distance += sz_u64_popcount((~match_vec.u64) & 0x8080808080808080ull); - } - } -#endif - - for (; a != a_end && distance < bound; ++a, ++b) { distance += (*a != *b); } - return sz_min_of_two(distance, bound); -} - -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - - sz_cptr_t const a_end = a + a_length; - sz_cptr_t const b_end = b + b_length; - sz_size_t distance = 0; - - sz_rune_t a_rune, b_rune; - sz_rune_length_t a_rune_length, b_rune_length; - - if (bound) { - for (; a < a_end && b < b_end && distance < bound; a += a_rune_length, b += b_rune_length) { - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - distance += (a_rune != b_rune); - } - // If one string has more runes, we need to go through the tail. - if (distance < bound) { - for (; a < a_end && distance < bound; a += a_rune_length, ++distance) - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - - for (; b < b_end && distance < bound; b += b_rune_length, ++distance) - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - } - } - else { - for (; a < a_end && b < b_end; a += a_rune_length, b += b_rune_length) { - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - distance += (a_rune != b_rune); - } - // If one string has more runes, we need to go through the tail. - for (; a < a_end; a += a_rune_length, ++distance) _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - for (; b < b_end; b += b_rune_length, ++distance) _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - } - return distance; -} - -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length) { - sz_u64_t checksum = 0; - sz_u8_t const *text_u8 = (sz_u8_t const *)text; - sz_u8_t const *text_end = text_u8 + length; - for (; text_u8 != text_end; ++text_u8) checksum += *text_u8; - return checksum; -} - -/** - * @brief Largest prime number that fits into 31 bits. - * @see https://mersenneforum.org/showthread.php?t=3471 - */ -#define SZ_U32_MAX_PRIME (2147483647u) - -/** - * @brief Largest prime number that fits into 64 bits. - * @see https://mersenneforum.org/showthread.php?t=3471 - * - * 2^64 = 18,446,744,073,709,551,616 - * this = 18,446,744,073,709,551,557 - * diff = 59 - */ -#define SZ_U64_MAX_PRIME (18446744073709551557ull) - -/* - * One hardware-accelerated way of mixing hashes can be CRC, but it's only implemented for 32-bit values. 
- * Using a Boost-like mixer works very poorly in such case: - * - * hash_first ^ (hash_second + 0x517cc1b727220a95 + (hash_first << 6) + (hash_first >> 2)); - * - * Let's stick to the Fibonacci hash trick using the golden ratio. - * https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ - */ -#define _sz_hash_mix(first, second) ((first * 11400714819323198485ull) ^ (second * 11400714819323198485ull)) -#define _sz_shift_low(x) (x) -#define _sz_shift_high(x) ((x + 77ull) & 0xFFull) -#define _sz_prime_mod(x) (x % SZ_U64_MAX_PRIME) - -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length) { - - sz_u64_t hash_low = 0; - sz_u64_t hash_high = 0; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - switch (length) { - case 0: return 0; - - // Texts under 7 bytes long are definitely below the largest prime. - case 1: - hash_low = _sz_shift_low(text[0]); - hash_high = _sz_shift_high(text[0]); - break; - case 2: - hash_low = _sz_shift_low(text[0]) * 31ull + _sz_shift_low(text[1]); - hash_high = _sz_shift_high(text[0]) * 257ull + _sz_shift_high(text[1]); - break; - case 3: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull + // - _sz_shift_low(text[2]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull + // - _sz_shift_high(text[2]); - break; - case 4: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull + // - _sz_shift_low(text[3]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull + // - _sz_shift_high(text[3]); - break; - case 5: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull + // - _sz_shift_low(text[4]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull + // - _sz_shift_high(text[4]); - break; - case 6: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull + // - _sz_shift_low(text[5]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull + // - _sz_shift_high(text[5]); - break; - case 7: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull * 31ull + // - _sz_shift_low(text[5]) * 31ull + // - _sz_shift_low(text[6]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull 
* 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull * 257ull + // - _sz_shift_high(text[5]) * 257ull + // - _sz_shift_high(text[6]); - break; - default: - // Unroll the first seven cycles: - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - hash_low = hash_low * 31ull + _sz_shift_low(text[1]); - hash_high = hash_high * 257ull + _sz_shift_high(text[1]); - hash_low = hash_low * 31ull + _sz_shift_low(text[2]); - hash_high = hash_high * 257ull + _sz_shift_high(text[2]); - hash_low = hash_low * 31ull + _sz_shift_low(text[3]); - hash_high = hash_high * 257ull + _sz_shift_high(text[3]); - hash_low = hash_low * 31ull + _sz_shift_low(text[4]); - hash_high = hash_high * 257ull + _sz_shift_high(text[4]); - hash_low = hash_low * 31ull + _sz_shift_low(text[5]); - hash_high = hash_high * 257ull + _sz_shift_high(text[5]); - hash_low = hash_low * 31ull + _sz_shift_low(text[6]); - hash_high = hash_high * 257ull + _sz_shift_high(text[6]); - text += 7; - - // Iterate throw the rest with the modulus: - for (; text != text_end; ++text) { - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - } - break; - } - - return _sz_hash_mix(hash_low, hash_high); -} - -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Compute the initial hash value for the first window. - sz_u64_t hash_low = 0, hash_high = 0, hash_mix; - for (sz_u8_t const *first_end = text + window_length; text < first_end; ++text) - hash_low = (hash_low * 31ull + _sz_shift_low(*text)) % SZ_U64_MAX_PRIME, - hash_high = (hash_high * 257ull + _sz_shift_high(*text)) % SZ_U64_MAX_PRIME; - - // In most cases the fingerprint length will be a power of two. - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - - // Compute the hash value for every window, exporting into the fingerprint, - // using the expensive modulo operation. - sz_size_t cycles = 1; - sz_size_t const step_mask = step - 1; - for (; text < text_end; ++text, ++cycles) { - // Discard one character: - hash_low -= _sz_shift_low(*(text - window_length)) * prime_power_low; - hash_high -= _sz_shift_high(*(text - window_length)) * prime_power_high; - // And add a new one: - hash_low = 31ull * hash_low + _sz_shift_low(*text); - hash_high = 257ull * hash_high + _sz_shift_high(*text); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - // Mix only if we've skipped enough hashes. 
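
As a usage sketch for the rolling hasher, here is a made-up consumer that keeps the smallest mixed hash, the usual building block for min-hash style fingerprints. Note that the `step_mask = step - 1` trick above assumes a power-of-two `step`, and `step == 1` reports every window via the `(cycles & step_mask) == 0` check just below; `text` and `text_length` in the commented call are hypothetical placeholders:

    static void keep_smallest_hash(sz_cptr_t start, sz_size_t window_length, sz_u64_t hash, void *handle) {
        sz_u64_t *smallest = (sz_u64_t *)handle;
        if (hash < *smallest) *smallest = hash;
        sz_unused(start && window_length);
    }
    // At the call site:
    //   sz_u64_t smallest = 0xFFFFFFFFFFFFFFFFull;
    //   sz_hashes_serial(text, text_length, 7 /* window */, 1 /* every position */, keep_smallest_hash, &smallest);
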
- if ((cycles & step_mask) == 0) { - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - } - } -} - -#undef _sz_shift_low -#undef _sz_shift_high -#undef _sz_hash_mix -#undef _sz_prime_mod - -/** - * @brief Uses a small lookup-table to convert a lowercase character to uppercase. - */ -SZ_INTERNAL sz_u8_t sz_u8_tolower(sz_u8_t c) { - static sz_u8_t const lowered[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return lowered[c]; -} - -/** - * @brief Uses a small lookup-table to convert an uppercase character to lowercase. - */ -SZ_INTERNAL sz_u8_t sz_u8_toupper(sz_u8_t c) { - static sz_u8_t const upped[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return upped[c]; -} - -/** - * @brief Uses two small lookup tables (768 bytes total) to accelerate division by a small - * unsigned integer. Performs two lookups, one multiplication, two shifts, and two accumulations. - * - * @param divisor Integral value @b larger than one. - * @param number Integral value to divide. 
- */ -SZ_INTERNAL sz_u8_t sz_u8_divide(sz_u8_t number, sz_u8_t divisor) { - sz_assert(divisor > 1); - static sz_u16_t const multipliers[256] = { - 0, 0, 0, 21846, 0, 39322, 21846, 9363, 0, 50973, 39322, 29790, 21846, 15124, 9363, 4370, - 0, 57826, 50973, 44841, 39322, 34329, 29790, 25645, 21846, 18351, 15124, 12137, 9363, 6780, 4370, 2115, - 0, 61565, 57826, 54302, 50973, 47824, 44841, 42011, 39322, 36765, 34329, 32006, 29790, 27671, 25645, 23705, - 21846, 20063, 18351, 16706, 15124, 13602, 12137, 10725, 9363, 8049, 6780, 5554, 4370, 3224, 2115, 1041, - 0, 63520, 61565, 59668, 57826, 56039, 54302, 52614, 50973, 49377, 47824, 46313, 44841, 43407, 42011, 40649, - 39322, 38028, 36765, 35532, 34329, 33154, 32006, 30885, 29790, 28719, 27671, 26647, 25645, 24665, 23705, 22766, - 21846, 20945, 20063, 19198, 18351, 17520, 16706, 15907, 15124, 14356, 13602, 12863, 12137, 11424, 10725, 10038, - 9363, 8700, 8049, 7409, 6780, 6162, 5554, 4957, 4370, 3792, 3224, 2665, 2115, 1573, 1041, 517, - 0, 64520, 63520, 62535, 61565, 60609, 59668, 58740, 57826, 56926, 56039, 55164, 54302, 53452, 52614, 51788, - 50973, 50169, 49377, 48595, 47824, 47063, 46313, 45572, 44841, 44120, 43407, 42705, 42011, 41326, 40649, 39982, - 39322, 38671, 38028, 37392, 36765, 36145, 35532, 34927, 34329, 33738, 33154, 32577, 32006, 31443, 30885, 30334, - 29790, 29251, 28719, 28192, 27671, 27156, 26647, 26143, 25645, 25152, 24665, 24182, 23705, 23233, 22766, 22303, - 21846, 21393, 20945, 20502, 20063, 19628, 19198, 18772, 18351, 17933, 17520, 17111, 16706, 16305, 15907, 15514, - 15124, 14738, 14356, 13977, 13602, 13231, 12863, 12498, 12137, 11779, 11424, 11073, 10725, 10380, 10038, 9699, - 9363, 9030, 8700, 8373, 8049, 7727, 7409, 7093, 6780, 6470, 6162, 5857, 5554, 5254, 4957, 4662, - 4370, 4080, 3792, 3507, 3224, 2943, 2665, 2388, 2115, 1843, 1573, 1306, 1041, 778, 517, 258, - }; - // This table can be avoided using a single addition and counting trailing zeros. 
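
For intuition, a worked example of the reciprocal trick above, taking `multipliers[7] == 9363` together with the shift of 2 that the table below assigns to divisor 7:

    // number == 200, divisor == 7:
    //   q = (9363 * 200) >> 16 = 1872600 >> 16 = 28
    //   t = ((200 - 28) >> 1) + 28 = 86 + 28 = 114
    //   t >> 2 = 28, matching 200 / 7 == 28.
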
- static sz_u8_t const shifts[256] = { - 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // - 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // - 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // - 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - }; - sz_u32_t multiplier = multipliers[divisor]; - sz_u8_t shift = shifts[divisor]; - - sz_u16_t q = (sz_u16_t)((multiplier * number) >> 16); - sz_u16_t t = ((number - q) >> 1) + q; - return (sz_u8_t)(t >> shift); -} - -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result) { - sz_u8_t const *unsigned_lut = (sz_u8_t const *)lut; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = unsigned_lut[*unsigned_text]; -} - -SZ_PUBLIC void sz_tolower_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_tolower(*unsigned_text); -} - -SZ_PUBLIC void sz_toupper_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_toupper(*unsigned_text); -} - -SZ_PUBLIC void sz_toascii_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = *unsigned_text & 0x7F; -} - -/** - * @brief Check if there is a byte in this buffer, that exceeds 127 and can't be an ASCII character. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - */ -SZ_PUBLIC sz_bool_t sz_isascii_serial(sz_cptr_t text, sz_size_t length) { - - if (!length) return sz_true_k; - sz_u8_t const *h = (sz_u8_t const *)text; - sz_u8_t const *const h_end = h + length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h & 0x80ull) return sz_false_k; -#endif - - // Validate eight bytes at once using SWAR. - sz_u64_vec_t text_vec; - for (; h + 8 <= h_end; h += 8) { - text_vec.u64 = *(sz_u64_t const *)h; - if (text_vec.u64 & 0x8080808080808080ull) return sz_false_k; - } - - // Handle the misaligned tail. 
- for (; h < h_end; ++h) - if (*h & 0x80ull) return sz_false_k; - return sz_true_k; -} - -SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { - - sz_assert(alphabet_size > 0 && alphabet_size <= 256 && "Inadequate alphabet size"); - - if (alphabet_size == 1) sz_fill(result, result_length, *alphabet); - - else { - sz_assert(generator && "Expects a valid random generator"); - sz_u8_t divisor = (sz_u8_t)alphabet_size; - for (sz_cptr_t end = result + result_length; result != end; ++result) { - sz_u8_t random = generator(generator_user_data) & 0xFF; - sz_u8_t quotient = sz_u8_divide(random, divisor); - *result = alphabet[random - quotient * divisor]; - } - } -} - -#pragma endregion - -/* - * Serial implementation of string class operations. - */ -#pragma region Serial Implementation for the String Class - -SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string) { - // It doesn't matter if it's on stack or heap, the pointer location is the same. - return (sz_bool_t)((sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]); -} - -SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length) { - sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]; - sz_size_t is_big_mask = is_small - 1ull; - *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same. - // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. - *length = string->external.length & (0x00000000000000FFull | is_big_mask); -} - -SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, - sz_bool_t *is_external) { - sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]; - sz_size_t is_big_mask = is_small - 1ull; - *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same. - // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. - *length = string->external.length & (0x00000000000000FFull | is_big_mask); - // In case the string is small, the `is_small - 1ull` will become 0xFFFFFFFFFFFFFFFFull. - *space = sz_u64_blend(SZ_STRING_INTERNAL_SPACE, string->external.space, is_big_mask); - *is_external = (sz_bool_t)!is_small; -} - -SZ_PUBLIC sz_bool_t sz_string_equal(sz_string_t const *a, sz_string_t const *b) { - // Tempting to say that the external.length is bitwise the same even if it includes - // some bytes of the on-stack payload, but we don't at this writing maintain that invariant. - // (An on-stack string includes noise bytes in the high-order bits of external.length. So do this - // the hard/correct way. - -#if SZ_USE_MISALIGNED_LOADS - // Dealing with StringZilla strings, we know that the `start` pointer always points - // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. - -#endif - // Alternatively, fall back to byte-by-byte comparison. 
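
As an aside, a hypothetical helper showing the branchless small-string layout from `sz_string_range` above in action; `illustrate_small_string_layout` is a made-up name, and the sketch assumes a 64-bit build where 6 bytes fit into the internal buffer:

    static void illustrate_small_string_layout(void) {
        sz_memory_allocator_t alloc;
        sz_memory_allocator_init_default(&alloc);
        sz_string_t string;
        sz_string_init_length(&string, 5, &alloc); // 5 + 1 bytes fit into SZ_STRING_INTERNAL_SPACE
        sz_ptr_t begin;
        sz_size_t length;
        sz_string_range(&string, &begin, &length); // is_small == 1 -> is_big_mask == 0 -> mask == 0xFF -> length == 5
        sz_string_free(&string, &alloc);           // a no-op deallocation for on-stack strings
    }
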
- sz_ptr_t a_start, b_start; - sz_size_t a_length, b_length; - sz_string_range(a, &a_start, &a_length); - sz_string_range(b, &b_start, &b_length); - return (sz_bool_t)(a_length == b_length && sz_equal(a_start, b_start, b_length)); -} - -SZ_PUBLIC sz_ordering_t sz_string_order(sz_string_t const *a, sz_string_t const *b) { -#if SZ_USE_MISALIGNED_LOADS - // Dealing with StringZilla strings, we know that the `start` pointer always points - // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. - -#endif - // Alternatively, fall back to byte-by-byte comparison. - sz_ptr_t a_start, b_start; - sz_size_t a_length, b_length; - sz_string_range(a, &a_start, &a_length); - sz_string_range(b, &b_start, &b_length); - return sz_order(a_start, a_length, b_start, b_length); -} - -SZ_PUBLIC void sz_string_init(sz_string_t *string) { - sz_assert(string && "String can't be SZ_NULL."); - - // Only 8 + 1 + 1 need to be initialized. - string->internal.start = &string->internal.chars[0]; - // But for safety let's initialize the entire structure to zeros. - // string->internal.chars[0] = 0; - // string->internal.length = 0; - string->words[1] = 0; - string->words[2] = 0; - string->words[3] = 0; -} - -SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator) { - sz_size_t space_needed = length + 1; // space for trailing \0 - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); - // Initialize the string to zeros for safety. - string->words[1] = 0; - string->words[2] = 0; - string->words[3] = 0; - // If we are lucky, no memory allocations will be needed. - if (space_needed <= SZ_STRING_INTERNAL_SPACE) { - string->internal.start = &string->internal.chars[0]; - string->internal.length = (sz_u8_t)length; - } - else { - // If we are not lucky, we need to allocate memory. - string->external.start = (sz_ptr_t)allocator->allocate(space_needed, allocator->handle); - if (!string->external.start) return SZ_NULL_CHAR; - string->external.length = length; - string->external.space = space_needed; - } - sz_assert(&string->internal.start == &string->external.start && "Alignment confusion"); - string->external.start[length] = 0; - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - - sz_size_t new_space = new_capacity + 1; - if (new_space <= SZ_STRING_INTERNAL_SPACE) return string->external.start; - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - sz_assert(new_space > string_space && "New space must be larger than current."); - - sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); - if (!new_start) return SZ_NULL_CHAR; - - sz_copy(new_start, string_start, string_length); - string->external.start = new_start; - string->external.space = new_space; - string->external.padding = 0; - string->external.length = string_length; - - // Deallocate the old string. 
- if (string_is_external) allocator->free(string_start, string_space, allocator->handle); - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // We may already be space-optimal, and in that case we don't need to do anything. - sz_size_t new_space = string_length + 1; - if (string_space == new_space || !string_is_external) return string->external.start; - - sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); - if (!new_start) return SZ_NULL_CHAR; - - sz_copy(new_start, string_start, string_length); - string->external.start = new_start; - string->external.space = new_space; - string->external.padding = 0; - string->external.length = string_length; - - // Deallocate the old string. - if (string_is_external) allocator->free(string_start, string_space, allocator->handle); - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length, - sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // The user intended to extend the string. - offset = sz_min_of_two(offset, string_length); - - // If we are lucky, no memory allocations will be needed. - if (string_length + added_length < string_space) { - sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); - string_start[string_length + added_length] = 0; - // Even if the string is on the stack, the `+=` won't affect the tail of the string. - string->external.length += added_length; - } - // If we are not lucky, we need to allocate more memory. - else { - sz_size_t next_planned_size = sz_max_of_two(SZ_CACHE_LINE_WIDTH, string_space * 2ull); - sz_size_t min_needed_space = sz_size_bit_ceil(offset + string_length + added_length + 1); - sz_size_t new_space = sz_max_of_two(min_needed_space, next_planned_size); - string_start = sz_string_reserve(string, new_space - 1, allocator); - if (!string_start) return SZ_NULL_CHAR; - - // Copy into the new buffer. - sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); - string_start[string_length + added_length] = 0; - string->external.length = string_length + added_length; - } - - return string_start; -} - -SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length) { - - sz_assert(string && "String can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // Normalize the offset, it can't be larger than the length. - offset = sz_min_of_two(offset, string_length); - - // We shouldn't normalize the length, to avoid overflowing on `offset + length >= string_length`, - // if receiving `length == SZ_SIZE_MAX`. 
After the following expression, `length` will contain
- // exactly the delta between original and final length of this `string`.
- length = sz_min_of_two(length, string_length - offset);
-
- // There are 2 common cases that wouldn't even require a `memmove`:
- // 1. Erasing the entire contents of the string.
- // In that case the `length` argument will be equal to or greater than the `length` member.
- // 2. Removing the tail of the string with something like `string.pop_back()` in C++.
- //
- // In both of those, regardless of the location of the string - stack or heap,
- // the erasing is as easy as setting the length to the offset.
- // In every other case, we must `memmove` the tail of the string to the left.
- if (offset + length < string_length)
- sz_move(string_start + offset, string_start + offset + length, string_length - offset - length);
-
- // The `string->external.length = offset` assignment would discard the last characters
- // of the on-the-stack string, but in-place subtraction works.
- string->external.length -= length;
- string_start[string_length - length] = 0;
- return length;
-}
-
-SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator) {
- if (!sz_string_is_on_stack(string))
- allocator->free(string->external.start, string->external.space, allocator->handle);
- sz_string_init(string);
-}
-
-// When overriding libc, disable optimisations for this function because MSVC will optimize the loops into a memset,
-// which then causes a stack overflow due to infinite recursion (memset -> sz_fill_serial -> memset).
-#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC
-#pragma optimize("", off)
-#endif
-SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value) {
- sz_ptr_t end = target + length;
- // Dealing with short strings, a single sequential pass would be faster.
- // If the size is larger than 2 words, then at least 1 of them will be aligned.
- // But just one aligned word may not be worth SWAR.
- if (length < SZ_SWAR_THRESHOLD)
- while (target != end) *(target++) = value;
-
- // In case of long strings, skip unaligned bytes, and then fill the rest in 64-bit chunks.
- else {
- sz_u64_t value64 = (sz_u64_t)value * 0x0101010101010101ull;
- while ((sz_size_t)target & 7ull) *(target++) = value;
- while (target + 8 <= end) *(sz_u64_t *)target = value64, target += 8;
- while (target != end) *(target++) = value;
- }
-}
-#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC
-#pragma optimize("", on)
-#endif
-
-SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
-#if SZ_USE_MISALIGNED_LOADS
- while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)source, target += 8, source += 8, length -= 8;
-#endif
- while (length--) *(target++) = *(source++);
-}
-
-SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
- // Implementing `memmove` is trickier than `memcpy`, as the ranges may overlap.
- // Existing implementations often have two passes, in normal and reversed order,
- // depending on the relation of `target` and `source` addresses.
- // https://student.cs.uwaterloo.ca/~cs350/common/os161-src-html/doxygen/html/memmove_8c_source.html
- // https://marmota.medium.com/c-language-making-memmove-def8792bb8d5
- //
- // We can use the `memcpy`-like left-to-right pass if we know that the `target` is before `source`.
- // Or if we know that they don't intersect!
In that case the traversal order is irrelevant,
- // but older CPUs may predict and fetch forward-passes better.
- if (target < source || target >= source + length) {
-#if SZ_USE_MISALIGNED_LOADS
- while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)(source), target += 8, source += 8, length -= 8;
-#endif
- while (length--) *(target++) = *(source++);
- }
- else {
- // Jump to the end and walk backwards.
- target += length, source += length;
-#if SZ_USE_MISALIGNED_LOADS
- while (length >= 8) *(sz_u64_t *)(target -= 8) = *(sz_u64_t const *)(source -= 8), length -= 8;
-#endif
- while (length--) *(--target) = *(--source);
- }
-}
-
-#pragma endregion
-
-/*
- * @brief Serial implementation for string sequence processing.
- */
-#pragma region Serial Implementation for Sequences
-
-SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) {
-
- sz_size_t matches = 0;
- while (matches != sequence->count && predicate(sequence, sequence->order[matches])) ++matches;
-
- for (sz_size_t i = matches + 1; i < sequence->count; ++i)
- if (predicate(sequence, sequence->order[i]))
- sz_u64_swap(sequence->order + i, sequence->order + matches), ++matches;
-
- return matches;
-}
-
-SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) {
-
- sz_size_t start_b = partition + 1;
-
- // If the direct merge is already sorted
- if (!less(sequence, sequence->order[start_b], sequence->order[partition])) return;
-
- sz_size_t start_a = 0;
- while (start_a <= partition && start_b <= sequence->count) {
-
- // If element 1 is in the right place
- if (!less(sequence, sequence->order[start_b], sequence->order[start_a])) { start_a++; }
- else {
- sz_size_t value = sequence->order[start_b];
- sz_size_t index = start_b;
-
- // Shift all the elements between element 1
- // and element 2 right by one.
- while (index != start_a) { sequence->order[index] = sequence->order[index - 1], index--; } - sequence->order[start_a] = value; - - // Update all the pointers - start_a++; - partition++; - start_b++; - } - } -} - -SZ_PUBLIC void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - sz_u64_t *keys = sequence->order; - sz_size_t keys_count = sequence->count; - for (sz_size_t i = 1; i < keys_count; i++) { - sz_u64_t i_key = keys[i]; - sz_size_t j = i; - for (; j > 0 && less(sequence, i_key, keys[j - 1]); --j) keys[j] = keys[j - 1]; - keys[j] = i_key; - } -} - -SZ_INTERNAL void _sz_sift_down(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start, - sz_size_t end) { - sz_size_t root = start; - while (2 * root + 1 <= end) { - sz_size_t child = 2 * root + 1; - if (child + 1 <= end && less(sequence, order[child], order[child + 1])) { child++; } - if (!less(sequence, order[root], order[child])) { return; } - sz_u64_swap(order + root, order + child); - root = child; - } -} - -SZ_INTERNAL void _sz_heapify(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t count) { - sz_size_t start = (count - 2) / 2; - while (1) { - _sz_sift_down(sequence, less, order, start, count - 1); - if (start == 0) return; - start--; - } -} - -SZ_INTERNAL void _sz_heapsort(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last) { - sz_u64_t *order = sequence->order; - sz_size_t count = last - first; - _sz_heapify(sequence, less, order + first, count); - sz_size_t end = count - 1; - while (end > 0) { - sz_u64_swap(order + first, order + first + end); - end--; - _sz_sift_down(sequence, less, order + first, 0, end); - } -} - -SZ_PUBLIC void sz_sort_introsort_recursion(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, - sz_size_t last, sz_size_t depth) { - - sz_size_t length = last - first; - switch (length) { - case 0: - case 1: return; - case 2: - if (less(sequence, sequence->order[first + 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[first + 1]); - return; - case 3: { - sz_u64_t a = sequence->order[first]; - sz_u64_t b = sequence->order[first + 1]; - sz_u64_t c = sequence->order[first + 2]; - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - if (less(sequence, c, b)) sz_u64_swap(&c, &b); - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - sequence->order[first] = a; - sequence->order[first + 1] = b; - sequence->order[first + 2] = c; - return; - } - } - // Until a certain length, the quadratic-complexity insertion-sort is fine - if (length <= 16) { - sz_sequence_t sub_seq = *sequence; - sub_seq.order += first; - sub_seq.count = length; - sz_sort_insertion(&sub_seq, less); - return; - } - - // Fallback to N-logN-complexity heap-sort - if (depth == 0) { - _sz_heapsort(sequence, less, first, last); - return; - } - - --depth; - - // Median-of-three logic to choose pivot - sz_size_t median = first + length / 2; - if (less(sequence, sequence->order[median], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[median]); - if (less(sequence, sequence->order[last - 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[last - 1]); - if (less(sequence, sequence->order[median], sequence->order[last - 1])) - sz_u64_swap(&sequence->order[median], &sequence->order[last - 1]); - - // Partition using the median-of-three as the pivot - sz_u64_t pivot = sequence->order[median]; - sz_size_t left = first; - 
sz_size_t right = last - 1;
- while (1) {
- while (less(sequence, sequence->order[left], pivot)) left++;
- while (less(sequence, pivot, sequence->order[right])) right--;
- if (left >= right) break;
- sz_u64_swap(&sequence->order[left], &sequence->order[right]);
- left++;
- right--;
- }
-
- // Recursively sort the partitions
- sz_sort_introsort_recursion(sequence, less, first, left, depth);
- sz_sort_introsort_recursion(sequence, less, right + 1, last, depth);
-}
-
-SZ_PUBLIC void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) {
- if (sequence->count == 0) return;
- sz_size_t size_is_not_power_of_two = (sequence->count & (sequence->count - 1)) != 0;
- sz_size_t depth_limit = sz_size_log2i_nonzero(sequence->count) + size_is_not_power_of_two;
- sz_sort_introsort_recursion(sequence, less, 0, sequence->count, depth_limit);
-}
-
-SZ_PUBLIC void sz_sort_recursion( //
- sz_sequence_t *sequence, sz_size_t bit_idx, sz_size_t bit_max, sz_sequence_comparator_t comparator,
- sz_size_t partial_order_length) {
-
- if (!sequence->count) return;
-
- // Array of size one doesn't need sorting - only needs the prefix to be discarded.
- if (sequence->count == 1) {
- sz_u32_t *order_half_words = (sz_u32_t *)sequence->order;
- order_half_words[1] = 0;
- return;
- }
-
- // Partition a range of integers according to a specific bit value
- sz_size_t split = 0;
- sz_u64_t mask = (1ull << 63) >> bit_idx;
-
- // The clean approach would be to perform a single pass over the sequence.
- //
- // while (split != sequence->count && !(sequence->order[split] & mask)) ++split;
- // for (sz_size_t i = split + 1; i < sequence->count; ++i)
- // if (!(sequence->order[i] & mask)) sz_u64_swap(sequence->order + i, sequence->order + split), ++split;
- //
- // This, however, doesn't take into account the high relative cost of writes and swaps.
- // To circumvent that, we can first count the total number of entries to be mapped into either part.
- // And then walk through both parts, swapping the entries that are in the wrong part.
- // This would often lead to a ~15% performance gain.
- sz_size_t count_with_bit_set = 0;
- for (sz_size_t i = 0; i != sequence->count; ++i) count_with_bit_set += (sequence->order[i] & mask) != 0;
- split = sequence->count - count_with_bit_set;
-
- // It's possible that the sequence is already partitioned.
- if (split != 0 && split != sequence->count) {
- // Use two pointers to efficiently reposition elements.
- // One pointer walks left-to-right from the start, and the other walks right-to-left from the end.
- sz_size_t left = 0;
- sz_size_t right = sequence->count - 1;
- while (1) {
- // Find the next element with the bit set on the left side.
- while (left < split && !(sequence->order[left] & mask)) ++left;
- // Find the next element without the bit set on the right side.
- while (right >= split && (sequence->order[right] & mask)) --right;
- // Swap the mispositioned elements.
- if (left < split && right >= split) {
- sz_u64_swap(sequence->order + left, sequence->order + right);
- ++left;
- --right;
- }
- else { break; }
- }
- }
-
- // Go down recursively.
- if (bit_idx < bit_max) {
- sz_sequence_t a = *sequence;
- a.count = split;
- sz_sort_recursion(&a, bit_idx + 1, bit_max, comparator, partial_order_length);
-
- sz_sequence_t b = *sequence;
- b.order += split;
- b.count -= split;
- sz_sort_recursion(&b, bit_idx + 1, bit_max, comparator, partial_order_length);
- }
- // Reached the end of recursion.
- else {
- // Discard the prefixes.
- sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; - for (sz_size_t i = 0; i != sequence->count; ++i) { order_half_words[i * 2 + 1] = 0; } - - sz_sequence_t a = *sequence; - a.count = split; - sz_sort_introsort(&a, comparator); - - sz_sequence_t b = *sequence; - b.order += split; - b.count -= split; - sz_sort_introsort(&b, comparator); - } -} - -SZ_INTERNAL sz_bool_t _sz_sort_is_less(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { - sz_cptr_t i_str = sequence->get_start(sequence, i_key); - sz_cptr_t j_str = sequence->get_start(sequence, j_key); - sz_size_t i_len = sequence->get_length(sequence, i_key); - sz_size_t j_len = sequence->get_length(sequence, j_key); - return (sz_bool_t)(sz_order_serial(i_str, i_len, j_str, j_len) == sz_less_k); -} - -SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t partial_order_length) { - -#if SZ_DETECT_BIG_ENDIAN - // TODO: Implement partial sort for big-endian systems. For now this sorts the whole thing. - sz_unused(partial_order_length); - sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); -#else - - // Export up to 4 bytes into the `sequence` bits themselves - for (sz_size_t i = 0; i != sequence->count; ++i) { - sz_cptr_t begin = sequence->get_start(sequence, sequence->order[i]); - sz_size_t length = sequence->get_length(sequence, sequence->order[i]); - length = length > 4u ? 4u : length; - sz_ptr_t prefix = (sz_ptr_t)&sequence->order[i]; - for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; - } - - // Perform optionally-parallel radix sort on them - sz_sort_recursion(sequence, 0, 32, (sz_sequence_comparator_t)_sz_sort_is_less, partial_order_length); -#endif -} - -SZ_PUBLIC void sz_sort(sz_sequence_t *sequence) { -#if SZ_DETECT_BIG_ENDIAN - sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); -#else - sz_sort_partial(sequence, sequence->count); -#endif -} - -#pragma endregion - -/* - * @brief AVX2 implementation of the string search algorithms. - * Very minimalistic, but still faster than the serial implementation. - */ -#pragma region AVX2 Implementation - -#if SZ_USE_X86_AVX2 -#pragma GCC push_options -#pragma GCC target("avx2") -#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) -#include - -/** - * @brief Helper structure to simplify work with 256-bit registers. - */ -typedef union sz_u256_vec_t { - __m256i ymm; - __m128i xmms[2]; - sz_u64_t u64s[4]; - sz_u32_t u32s[8]; - sz_u16_t u16s[16]; - sz_u8_t u8s[32]; -} sz_u256_vec_t; - -SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: - //! https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations - return sz_order_serial(a, a_length, b, b_length); -} - -SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_u256_vec_t a_vec, b_vec; - - while (length >= 32) { - a_vec.ymm = _mm256_lddqu_si256((__m256i const *)a); - b_vec.ymm = _mm256_lddqu_si256((__m256i const *)b); - // One approach can be to use "movemasks", but we could also use a bitwise matching like `_mm256_testnzc_si256`. 
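        // A sketch of that alternative - since this function only needs a boolean result, a
        // `vptest`-style check could replace the `movemask` and negation (illustrative only,
        // not the path taken below):
        //
        //     __m256i equality = _mm256_cmpeq_epi8(a_vec.ymm, b_vec.ymm);
        //     if (!_mm256_testc_si256(equality, _mm256_set1_epi8(-1))) return sz_false_k;
        //
        // `_mm256_testc_si256(x, ones)` is non-zero only when `x` has every bit set, i.e. when
        // all 32 byte comparisons matched.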
- int difference_mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi8(a_vec.ymm, b_vec.ymm)); - if (difference_mask == 0) { a += 32, b += 32, length -= 32; } - else { return sz_false_k; } - } - - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; -} - -SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - char value_char = *(char *)&value; - __m256i value_vec = _mm256_set1_epi8(value_char); - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores". - // - // for (; length >= 32; target += 32, length -= 32) _mm256_storeu_si256(target, value_vec); - // sz_fill_serial(target, length, value); - // - // When the buffer is small, there isn't much to innovate. - if (length <= 32) sz_fill_serial(target, length, value); - // When the buffer is aligned, we can avoid any split-stores. - else { - sz_size_t head_length = (32 - ((sz_size_t)target % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - sz_u16_t value16 = (sz_u16_t)value * 0x0101u; - sz_u32_t value32 = (sz_u32_t)value16 * 0x00010001u; - sz_u64_t value64 = (sz_u64_t)value32 * 0x0000000100000001ull; - - // Fill the head of the buffer. This part is much cleaner with AVX-512. - if (head_length & 1) *(sz_u8_t *)target = value, target++, head_length--; - if (head_length & 2) *(sz_u16_t *)target = value16, target += 2, head_length -= 2; - if (head_length & 4) *(sz_u32_t *)target = value32, target += 4, head_length -= 4; - if (head_length & 8) *(sz_u64_t *)target = value64, target += 8, head_length -= 8; - if (head_length & 16) - _mm_store_si128((__m128i *)target, _mm_set1_epi8(value_char)), target += 16, head_length -= 16; - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - - // Fill the aligned body of the buffer. - for (; body_length >= 32; target += 32, body_length -= 32) _mm256_store_si256((__m256i *)target, value_vec); - - // Fill the tail of the buffer. This part is much cleaner with AVX-512. - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - if (tail_length & 16) - _mm_store_si128((__m128i *)target, _mm_set1_epi8(value_char)), target += 16, tail_length -= 16; - if (tail_length & 8) *(sz_u64_t *)target = value64, target += 8, tail_length -= 8; - if (tail_length & 4) *(sz_u32_t *)target = value32, target += 4, tail_length -= 4; - if (tail_length & 2) *(sz_u16_t *)target = value16, target += 2, tail_length -= 2; - if (tail_length & 1) *(sz_u8_t *)target = value, target++, tail_length--; - } -} - -SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores" and "loads". - // - // for (; length >= 32; target += 32, source += 32, length -= 32) - // _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - // sz_copy_serial(target, source, length); - // - // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core, - // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. - // For now, let's avoid the cases beyond the L2 size. 
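    // The 1 MB threshold below is roughly the per-core L2 budget mentioned above: buffers larger
    // than that are unlikely to be cache-resident anyway, so they are routed to the bidirectional
    // traversal further down instead of the simple aligned loop.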
-    int is_huge = length > 1ull * 1024ull * 1024ull;
-    if (length <= 32) { sz_copy_serial(target, source, length); }
-    // When dealing with larger arrays, the optimization is not as simple as with the `sz_fill_avx2` function,
-    // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer,
-    // we can use aligned loads and stores, and the performance will be great.
-    else if ((sz_size_t)target % 32 == 0 && (sz_size_t)source % 32 == 0 && !is_huge) {
-        for (; length >= 32; target += 32, source += 32, length -= 32)
-            _mm256_store_si256((__m256i *)target, _mm256_load_si256((__m256i const *)source));
-        if (length) sz_copy_serial(target, source, length);
-    }
-    // The trickiest case is when both `source` and `target` are not aligned.
-    // In this and simpler cases we can copy enough bytes into `target` to reach its cacheline boundary,
-    // and then combine unaligned loads with aligned stores.
-    else {
-        sz_size_t head_length = (32 - ((sz_size_t)target % 32)) % 32; // 31 or less.
-        sz_size_t tail_length = (sz_size_t)(target + length) % 32;    // 31 or less.
-        sz_size_t body_length = length - head_length - tail_length;   // Multiple of 32.
-
-        // Fill the head of the buffer. This part is much cleaner with AVX-512.
-        if (head_length & 1) *(sz_u8_t *)target = *(sz_u8_t *)source, target++, source++, head_length--;
-        if (head_length & 2) *(sz_u16_t *)target = *(sz_u16_t *)source, target += 2, source += 2, head_length -= 2;
-        if (head_length & 4) *(sz_u32_t *)target = *(sz_u32_t *)source, target += 4, source += 4, head_length -= 4;
-        if (head_length & 8) *(sz_u64_t *)target = *(sz_u64_t *)source, target += 8, source += 8, head_length -= 8;
-        if (head_length & 16)
-            _mm_store_si128((__m128i *)target, _mm_lddqu_si128((__m128i const *)source)), target += 16, source += 16,
-            head_length -= 16;
-        sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size.");
-
-        // Fill the aligned body of the buffer.
-        if (!is_huge) {
-            for (; body_length >= 32; target += 32, source += 32, body_length -= 32)
-                _mm256_store_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source));
-        }
-        // When the buffer is huge, we can traverse it in 2 directions.
-        else {
-            for (; body_length >= 64; target += 32, source += 32, body_length -= 64) {
-                _mm256_store_si256((__m256i *)(target), _mm256_lddqu_si256((__m256i const *)(source)));
-                _mm256_store_si256((__m256i *)(target + body_length - 32),
-                                   _mm256_lddqu_si256((__m256i const *)(source + body_length - 32)));
-            }
-            if (body_length) _mm256_store_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source));
-        }
-
-        // Fill the tail of the buffer. This part is much cleaner with AVX-512.
- sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - if (tail_length & 16) - _mm_store_si128((__m128i *)target, _mm_lddqu_si128((__m128i const *)source)), target += 16, source += 16, - tail_length -= 16; - if (tail_length & 8) *(sz_u64_t *)target = *(sz_u64_t *)source, target += 8, source += 8, tail_length -= 8; - if (tail_length & 4) *(sz_u32_t *)target = *(sz_u32_t *)source, target += 4, source += 4, tail_length -= 4; - if (tail_length & 2) *(sz_u16_t *)target = *(sz_u16_t *)source, target += 2, source += 2, tail_length -= 2; - if (tail_length & 1) *(sz_u8_t *)target = *(sz_u8_t *)source, target++, source++, tail_length--; - } -} - -SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - if (target < source || target >= source + length) { - for (; length >= 32; target += 32, source += 32, length -= 32) - _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - while (length--) *(target++) = *(source++); - } - else { - // Jump to the end and walk backwards. - for (target += length, source += length; length >= 32; length -= 32) - _mm256_storeu_si256((__m256i *)(target -= 32), _mm256_lddqu_si256((__m256i const *)(source -= 32))); - while (length--) *(--target) = *(--source); - } -} - -SZ_PUBLIC sz_u64_t sz_checksum_avx2(sz_cptr_t text, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "loads". - // - // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core, - // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. - // For now, let's avoid the cases beyond the L2 size. - int is_huge = length > 1ull * 1024ull * 1024ull; - - // When the buffer is small, there isn't much to innovate. - if (length <= 32) { return sz_checksum_serial(text, length); } - else if (!is_huge) { - sz_u256_vec_t text_vec, sums_vec; - sums_vec.ymm = _mm256_setzero_si256(); - for (; length >= 32; text += 32, length -= 32) { - text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - sz_u64_t result = low + high; - if (length) result += sz_checksum_serial(text, length); - return result; - } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // Most notably, we can avoid populating the cache with the entire buffer, and instead traverse it in 2 directions. - else { - sz_size_t head_length = (32 - ((sz_size_t)text % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(text + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - sz_u64_t result = 0; - - // Handle the head - while (head_length--) result += *text++; - - sz_u256_vec_t text_vec, sums_vec; - sums_vec.ymm = _mm256_setzero_si256(); - // Fill the aligned body of the buffer. 
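        // `_mm256_sad_epu8(text_vec.ymm, zero)` folds every group of 8 unsigned bytes into one
        // 64-bit lane, so a single instruction reduces 32 bytes to four partial sums that cannot
        // overflow for any realistic input length. A scalar model of the first lane after one step:
        //
        //     sz_u64_t lane0 = 0;
        //     for (int i = 0; i != 8; ++i) lane0 += text_vec.u8s[i];
        //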
- if (!is_huge) { - for (; body_length >= 32; text += 32, body_length -= 32) { - text_vec.ymm = _mm256_stream_load_si256((__m256i const *)text); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - } - // When the biffer is huge, we can traverse it in 2 directions. - else { - sz_u256_vec_t text_reversed_vec, sums_reversed_vec; - sums_reversed_vec.ymm = _mm256_setzero_si256(); - for (; body_length >= 64; text += 64, body_length -= 64) { - text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - text_reversed_vec.ymm = _mm256_stream_load_si256((__m256i *)(text + body_length - 64)); - sums_reversed_vec.ymm = _mm256_add_epi64( - sums_reversed_vec.ymm, _mm256_sad_epu8(text_reversed_vec.ymm, _mm256_setzero_si256())); - } - if (body_length >= 32) { - text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, sums_reversed_vec.ymm); - } - - // Handle the tail - while (tail_length--) result += *text++; - - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - result += low + high; - return result; - } -} - -SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - // But if at least 3 cache lines are touched, the AVX-2 implementation should be faster. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } - - // We need to pull the lookup table into 8x YMM registers. - // The biggest issue is reorganizing the data in the lookup table, as AVX2 doesn't have 256-bit shuffle, - // it only has 128-bit "within-lane" shuffle. Still, it's wiser to use full YMM registers, instead of XMM, - // so that we can at least compensate high latency with twice larger window and one more level of lookup. 
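    // Conceptually, the 256-entry table is treated as 16 rows of 16 entries, so a scalar lookup
    // decomposes into a row pick by the high nibble and a column pick by the low nibble:
    //
    //     sz_u8_t byte = *(sz_u8_t const *)source;
    //     sz_u8_t row = byte >> 4, column = byte & 0x0F;
    //     *target = lut[row * 16 + column]; // same as lut[byte]
    //
    // The byte-shuffles below implement the column pick within each 16-byte row, and the blend
    // tree that follows implements the row pick.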
- sz_u256_vec_t lut_0_to_15_vec, lut_16_to_31_vec, lut_32_to_47_vec, lut_48_to_63_vec, // - lut_64_to_79_vec, lut_80_to_95_vec, lut_96_to_111_vec, lut_112_to_127_vec, // - lut_128_to_143_vec, lut_144_to_159_vec, lut_160_to_175_vec, lut_176_to_191_vec, // - lut_192_to_207_vec, lut_208_to_223_vec, lut_224_to_239_vec, lut_240_to_255_vec; - - lut_0_to_15_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut))); - lut_16_to_31_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 16))); - lut_32_to_47_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 32))); - lut_48_to_63_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 48))); - lut_64_to_79_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 64))); - lut_80_to_95_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 80))); - lut_96_to_111_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 96))); - lut_112_to_127_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 112))); - lut_128_to_143_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 128))); - lut_144_to_159_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 144))); - lut_160_to_175_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 160))); - lut_176_to_191_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 176))); - lut_192_to_207_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 192))); - lut_208_to_223_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 208))); - lut_224_to_239_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 224))); - lut_240_to_255_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 240))); - - // Assuming each lookup is performed within 16 elements of 256, we need to reduce the scope by 16x = 2^4. - sz_u256_vec_t not_first_bit_vec, not_second_bit_vec, not_third_bit_vec, not_fourth_bit_vec; - - /// Top and bottom nibbles of the source are used separately. - sz_u256_vec_t source_vec, source_bot_vec; - sz_u256_vec_t blended_0_to_31_vec, blended_32_to_63_vec, blended_64_to_95_vec, blended_96_to_127_vec, - blended_128_to_159_vec, blended_160_to_191_vec, blended_192_to_223_vec, blended_224_to_255_vec; - - // Handling the head. - while (length >= 32) { - // Load and separate the nibbles of each byte in the source. - source_vec.ymm = _mm256_lddqu_si256((__m256i const *)source); - source_bot_vec.ymm = _mm256_and_si256(source_vec.ymm, _mm256_set1_epi8((char)0x0F)); - - // In the first round, we select using the 4th bit. 
- not_fourth_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x10), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_16_to_31_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_0_to_15_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_32_to_63_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_48_to_63_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_32_to_47_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_64_to_95_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_80_to_95_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_64_to_79_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_96_to_127_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_112_to_127_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_96_to_111_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_144_to_159_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_128_to_143_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_160_to_191_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_176_to_191_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_160_to_175_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_192_to_223_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_208_to_223_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_192_to_207_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_224_to_255_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_240_to_255_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_224_to_239_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - - // Perform a tree-like reduction of the 8x "blended" YMM registers, depending on the "source" content. - // The first round selects using the 3rd bit. - not_third_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x20), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_32_to_63_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_third_bit_vec.ymm); - blended_64_to_95_vec.ymm = _mm256_blendv_epi8( // - blended_96_to_127_vec.ymm, // - blended_64_to_95_vec.ymm, // - not_third_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - blended_160_to_191_vec.ymm, // - blended_128_to_159_vec.ymm, // - not_third_bit_vec.ymm); - blended_192_to_223_vec.ymm = _mm256_blendv_epi8( // - blended_224_to_255_vec.ymm, // - blended_192_to_223_vec.ymm, // - not_third_bit_vec.ymm); - - // The second round selects using the 2nd bit. - not_second_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x40), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_64_to_95_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_second_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - blended_192_to_223_vec.ymm, // - blended_128_to_159_vec.ymm, // - not_second_bit_vec.ymm); - - // The third round selects using the 1st bit. 
- not_first_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x80), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_128_to_159_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_first_bit_vec.ymm); - - // And dump the result into the target. - _mm256_storeu_si256((__m256i *)target, blended_0_to_31_vec.ymm); - source += 32, target += 32, length -= 32; - } - - // Handle the tail. - if (length) sz_look_up_transform_serial(source, length, lut, target); -} - -SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - int mask; - sz_u256_vec_t h_vec, n_vec; - n_vec.ymm = _mm256_set1_epi8(n[0]); - - while (h_length >= 32) { - h_vec.ymm = _mm256_lddqu_si256((__m256i const *)h); - mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm)); - if (mask) return h + sz_u32_ctz(mask); - h += 32, h_length -= 32; - } - - return sz_find_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - int mask; - sz_u256_vec_t h_vec, n_vec; - n_vec.ymm = _mm256_set1_epi8(n[0]); - - while (h_length >= 32) { - h_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + h_length - 32)); - mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm)); - if (mask) return h + h_length - 1 - sz_u32_clz(mask); - h_length -= 32; - } - - return sz_rfind_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_avx2(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into YMM registers. - int matches; - sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]); - n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]); - n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]); - - // Scan through the string. - for (; h_length >= n_length + 32; h += 32, h_length -= 32) { - h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_first)); - h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_mid)); - h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_last)); - matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); - while (matches) { - int potential_offset = sz_u32_ctz(matches); - if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - - return sz_find_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx2(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. 
- sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into YMM registers. - int matches; - sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]); - n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]); - n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]); - - // Scan through the string. - sz_cptr_t h_reversed; - for (; h_length >= n_length + 32; h_length -= 32) { - h_reversed = h + h_length - n_length - 32 + 1; - h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_first)); - h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_mid)); - h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_last)); - matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); - while (matches) { - int potential_offset = sz_u32_clz(matches); - if (sz_equal(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - matches &= ~(1 << (31 - potential_offset)); - } - } - - return sz_rfind_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_avx2(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - - // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. - // That way when we invoke `_mm256_shuffle_epi8` we can use the same mask for both lanes. - sz_u256_vec_t filter_even_vec, filter_odd_vec; - for (sz_size_t i = 0; i != 16; ++i) - filter_even_vec.u8s[i] = filter->_u8s[i * 2], filter_odd_vec.u8s[i] = filter->_u8s[i * 2 + 1]; - filter_even_vec.xmms[1] = filter_even_vec.xmms[0]; - filter_odd_vec.xmms[1] = filter_odd_vec.xmms[0]; - - sz_u256_vec_t text_vec; - sz_u256_vec_t matches_vec; - sz_u256_vec_t lower_nibbles_vec, higher_nibbles_vec; - sz_u256_vec_t bitset_even_vec, bitset_odd_vec; - sz_u256_vec_t bitmask_vec, bitmask_lookup_vec; - bitmask_lookup_vec.ymm = _mm256_set_epi8(-128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); - - while (length >= 32) { - // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set" - // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so - // StrinZilla uses a somewhat different approach. - // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new - // - // sz_u8_t input = *(sz_u8_t const *)text; - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = filter_even_vec.u8s[hi_nibble]; - // sz_u8_t bitset_odd = filter_odd_vec.u8s[hi_nibble]; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_u8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd; - // if ((bitset & bitmask) != 0) return text; - // else { length--, text++; } - // - // The nice part about this, loading the strided data is vey easy with Arm NEON, - // while with x86 CPUs after AVX, shuffles within 256 bits shouldn't be an issue either. 
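        // As a concrete example, take the byte 'A' (0x41): `hi_nibble` is 4 and `lo_nibble` is 1,
        // so the scalar model above reads `filter->_u8s[8]` (the "even" byte of row 4) and tests
        // bit 1 in it - the same byte and bit a serial charset membership check would touch, since
        // 65 / 8 == 8 and 65 % 8 == 1.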
- text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); - lower_nibbles_vec.ymm = _mm256_and_si256(text_vec.ymm, _mm256_set1_epi8(0x0f)); - bitmask_vec.ymm = _mm256_shuffle_epi8(bitmask_lookup_vec.ymm, lower_nibbles_vec.ymm); - // - // At this point we can validate the `bitmask_vec` contents like this: - // - // for (sz_size_t i = 0; i != 32; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); - // } - // - // Shift right every byte by 4 bits. - // There is no `_mm256_srli_epi8` intrinsic, so we have to use `_mm256_srli_epi16` - // and combine it with a mask to clear the higher bits. - higher_nibbles_vec.ymm = _mm256_and_si256(_mm256_srli_epi16(text_vec.ymm, 4), _mm256_set1_epi8(0x0f)); - bitset_even_vec.ymm = _mm256_shuffle_epi8(filter_even_vec.ymm, higher_nibbles_vec.ymm); - bitset_odd_vec.ymm = _mm256_shuffle_epi8(filter_odd_vec.ymm, higher_nibbles_vec.ymm); - // - // At this point we can validate the `bitset_even_vec` and `bitset_odd_vec` contents like this: - // - // for (sz_size_t i = 0; i != 32; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t const *bitset_ptr = &filter->_u8s[0]; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; - // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); - // } - // - __m256i take_first = _mm256_cmpgt_epi8(_mm256_set1_epi8(8), lower_nibbles_vec.ymm); - bitset_even_vec.ymm = _mm256_blendv_epi8(bitset_odd_vec.ymm, bitset_even_vec.ymm, take_first); - - // It would have been great to have an instruction that tests the bits and then broadcasts - // the matching bit into all bits in that byte. But we don't have that, so we have to - // `and`, `cmpeq`, `movemask`, and then invert at the end... - matches_vec.ymm = _mm256_and_si256(bitset_even_vec.ymm, bitmask_vec.ymm); - matches_vec.ymm = _mm256_cmpeq_epi8(matches_vec.ymm, _mm256_setzero_si256()); - int matches_mask = ~_mm256_movemask_epi8(matches_vec.ymm); - if (matches_mask) { - int offset = sz_u32_ctz(matches_mask); - return text + offset; - } - else { text += 32, length -= 32; } - } - - return sz_find_charset_serial(text, length, filter); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx2(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); -} - -/** - * @brief There is no AVX2 instruction for fast multiplication of 64-bit integers. - * This implementation is coming from Agner Fog's Vector Class Library. 
- */ -SZ_INTERNAL __m256i _mm256_mul_epu64(__m256i a, __m256i b) { - __m256i bswap = _mm256_shuffle_epi32(b, 0xB1); - __m256i prodlh = _mm256_mullo_epi32(a, bswap); - __m256i zero = _mm256_setzero_si256(); - __m256i prodlh2 = _mm256_hadd_epi32(prodlh, zero); - __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2, 0x73); - __m256i prodll = _mm256_mul_epu32(a, b); - __m256i prod = _mm256_add_epi64(prodll, prodlh3); - return prod; -} - -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - if (length < 4 * window_length) { - sz_hashes_serial(start, length, window_length, step, callback, callback_handle); - return; - } - - // Using AVX2, we can perform 4 long integer multiplications and additions within one register. - // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel. - sz_size_t const max_hashes = length - window_length + 1; - sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads. - sz_u8_t const *text_first = (sz_u8_t const *)start; - sz_u8_t const *text_second = text_first + min_hashes_per_thread; - sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2; - sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3; - sz_u8_t const *text_end = text_first + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Broadcast the constants into the registers. - sz_u256_vec_t prime_vec, golden_ratio_vec; - sz_u256_vec_t base_low_vec, base_high_vec, prime_power_low_vec, prime_power_high_vec, shift_high_vec; - base_low_vec.ymm = _mm256_set1_epi64x(31ull); - base_high_vec.ymm = _mm256_set1_epi64x(257ull); - shift_high_vec.ymm = _mm256_set1_epi64x(77ull); - prime_vec.ymm = _mm256_set1_epi64x(SZ_U64_MAX_PRIME); - golden_ratio_vec.ymm = _mm256_set1_epi64x(11400714819323198485ull); - prime_power_low_vec.ymm = _mm256_set1_epi64x(prime_power_low); - prime_power_high_vec.ymm = _mm256_set1_epi64x(prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u256_vec_t hash_low_vec, hash_high_vec, hash_mix_vec, chars_low_vec, chars_high_vec; - hash_low_vec.ymm = _mm256_setzero_si256(); - hash_high_vec.ymm = _mm256_setzero_si256(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. 
- hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - } - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t const step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - hash_low_vec.ymm = - _mm256_sub_epi64(hash_low_vec.ymm, _mm256_mul_epu64(chars_low_vec.ymm, prime_power_low_vec.ymm)); - hash_high_vec.ymm = - _mm256_sub_epi64(hash_high_vec.ymm, _mm256_mul_epu64(chars_high_vec.ymm, prime_power_high_vec.ymm)); - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. - hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - - // 5. 
Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - if ((cycle & step_mask) == 0) { - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - } - } -} - -#pragma clang attribute pop -#pragma GCC pop_options -#endif -#pragma endregion - -/* - * @brief AVX-512 implementation of the string search algorithms. - * - * Different subsets of AVX-512 were introduced in different years: - * - 2017 SkyLake: F, CD, ER, PF, VL, DQ, BW - * - 2018 CannonLake: IFMA, VBMI - * - 2019 IceLake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES - * - 2020 TigerLake: VP2INTERSECT - */ -#pragma region AVX512 Implementation - -#if SZ_USE_X86_AVX512 -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) -#include - -/** - * @brief Helper structure to simplify work with 512-bit registers. - */ -typedef union sz_u512_vec_t { - __m512i zmm; - __m256i ymms[2]; - __m128i xmms[4]; - sz_u64_t u64s[8]; - sz_u32_t u32s[16]; - sz_u16_t u16s[32]; - sz_u8_t u8s[64]; - sz_i64_t i64s[8]; - sz_i32_t i32s[16]; -} sz_u512_vec_t; - -SZ_INTERNAL __mmask64 _sz_u64_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 64: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 64: - return _bzhi_u64(0xFFFFFFFFFFFFFFFF, n < 64 ? (sz_u32_t)n : 64); -} - -SZ_INTERNAL __mmask32 _sz_u32_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 32: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 32: - return _bzhi_u32(0xFFFFFFFF, n < 32 ? (sz_u32_t)n : 32); -} - -SZ_INTERNAL __mmask16 _sz_u16_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 16: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 16: - return _bzhi_u32(0xFFFFFFFF, n < 16 ? 
(sz_u32_t)n : 16); -} - -SZ_INTERNAL __mmask16 _sz_u16_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 16: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 16: - return (__mmask16)_bzhi_u32(0xFFFFFFFF, (sz_u32_t)n); -} - -SZ_INTERNAL __mmask32 _sz_u32_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 32: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 32: - return _bzhi_u32(0xFFFFFFFF, (sz_u32_t)n); -} - -SZ_INTERNAL __mmask64 _sz_u64_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 64: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 64: - return _bzhi_u64(0xFFFFFFFFFFFFFFFF, (sz_u32_t)n); -} - -SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - sz_u512_vec_t a_vec, b_vec; - - // Pointer arithmetic is cheap, fetching memory is not! - // So we can use the masked loads to fetch at most one cache-line for each string, - // compare the prefixes, and only then move forward. - sz_size_t a_head_length = 64 - ((sz_size_t)a % 64); // 63 or less. - sz_size_t b_head_length = 64 - ((sz_size_t)b % 64); // 63 or less. - a_head_length = a_head_length < a_length ? a_head_length : a_length; - b_head_length = b_head_length < b_length ? b_head_length : b_length; - sz_size_t head_length = a_head_length < b_head_length ? a_head_length : b_head_length; - __mmask64 head_mask = _sz_u64_mask_until(head_length); - a_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, b); - __mmask64 mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - else if (head_length == a_length && head_length == b_length) { return sz_equal_k; } - else { a += head_length, b += head_length, a_length -= head_length, b_length -= head_length; } - - // The rare case, when both string are very long. - __mmask64 a_mask, b_mask; - while ((a_length >= 64) & (b_length >= 64)) { - a_vec.zmm = _mm512_loadu_si512(a); - b_vec.zmm = _mm512_loadu_si512(b); - mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - a += 64, b += 64, a_length -= 64, b_length -= 64; - } - - // In most common scenarios at least one of the strings is under 64 bytes. - if (a_length | b_length) { - a_mask = _sz_u64_clamp_mask_until(a_length); - b_mask = _sz_u64_clamp_mask_until(b_length); - a_vec.zmm = _mm512_maskz_loadu_epi8(a_mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(b_mask, b); - // The AVX-512 `_mm512_mask_cmpneq_epi8_mask` intrinsics are generally handy in such environments. - // They, however, have latency 3 on most modern CPUs. Using AVX2: `_mm256_cmpeq_epi8` would have - // been cheaper, if we didn't have to apply `_mm256_movemask_epi8` afterwards. 
- mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - // From logic perspective, the hardest cases are "abc\0" and "abc". - // The result must be `sz_greater_k`, as the latter is shorter. - else { return _sz_order_scalars(a_length, b_length); } - } - - return sz_equal_k; -} - -SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - __mmask64 mask; - sz_u512_vec_t a_vec, b_vec; - - while (length >= 64) { - a_vec.zmm = _mm512_loadu_si512(a); - b_vec.zmm = _mm512_loadu_si512(b); - mask = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask != 0) return sz_false_k; - a += 64, b += 64, length -= 64; - } - - if (length) { - mask = _sz_u64_mask_until(length); - a_vec.zmm = _mm512_maskz_loadu_epi8(mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(mask, b); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpneq_epi8_mask(mask, a_vec.zmm, b_vec.zmm); - return (sz_bool_t)(mask == 0); - } - - return sz_true_k; -} - -SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - __m512i value_vec = _mm512_set1_epi8(value); - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores". - // - // for (; length >= 64; target += 64, length -= 64) _mm512_storeu_si512(target, value_vec); - // _mm512_mask_storeu_epi8(target, _sz_u64_mask_until(length), value_vec); - // - // When the buffer is small, there isn't much to innovate. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, value_vec); - } - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - _mm512_mask_storeu_epi8(target, head_mask, value_vec); - for (target += head_length; body_length >= 64; target += 64, body_length -= 64) - _mm512_store_si512(target, value_vec); - _mm512_mask_storeu_epi8(target, tail_mask, value_vec); - } -} - -SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores" and "loads". - // - // for (; length >= 64; target += 64, source += 64, length -= 64) - // _mm512_storeu_si512(target, _mm512_loadu_si512(source)); - // __mmask64 mask = _sz_u64_mask_until(length); - // _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - // - // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, - // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. 
-    // With two strings, we may consider the overall workload huge, if each exceeds 1 MB in length.
-    int const is_huge = length >= 1ull * 1024ull * 1024ull;
-
-    // When the buffer is small, there isn't much to innovate.
-    if (length <= 64) {
-        __mmask64 mask = _sz_u64_mask_until(length);
-        _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source));
-    }
-    // When dealing with larger arrays, the optimization is not as simple as with the `sz_fill_avx512` function,
-    // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer,
-    // we can use aligned loads and stores, and the performance will be great.
-    else if ((sz_size_t)target % 64 == 0 && (sz_size_t)source % 64 == 0 && !is_huge) {
-        for (; length >= 64; target += 64, source += 64, length -= 64)
-            _mm512_store_si512(target, _mm512_load_si512(source));
-        // At this point the length is guaranteed to be under 64.
-        __mmask64 mask = _sz_u64_mask_until(length);
-        // Aligned loads and stores would work too, but it's not defined.
-        _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source));
-    }
-    // The trickiest case is when both `source` and `target` are not aligned.
-    // In this and simpler cases we can copy enough bytes into `target` to reach its cacheline boundary,
-    // and then combine unaligned loads with aligned stores.
-    else if (!is_huge) {
-        sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less.
-        sz_size_t tail_length = (sz_size_t)(target + length) % 64;    // 63 or less.
-        sz_size_t body_length = length - head_length - tail_length;   // Multiple of 64.
-        __mmask64 head_mask = _sz_u64_mask_until(head_length);
-        __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
-        _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source));
-        for (target += head_length, source += head_length; body_length >= 64;
-             target += 64, source += 64, body_length -= 64)
-            _mm512_store_si512(target, _mm512_loadu_si512(source)); // Unaligned load, but aligned store!
-        _mm512_mask_storeu_epi8(target, tail_mask, _mm512_maskz_loadu_epi8(tail_mask, source));
-    }
-    // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use.
-    //
-    //    1. Moving in both directions to maximize the throughput, when fetching from multiple
-    //       memory pages. Also helps with cache set-associativity issues, as we won't always
-    //       be fetching the same entries in the lookup table.
-    //    2. Using non-temporal stores to avoid polluting the cache.
-    //    3. Prefetching the next cache line, to avoid stalling the CPU. This is generally useless
-    //       for predictable patterns, so disregard this advice.
-    //
-    // Bidirectional traversal adds about 10%, accelerating from 11 GB/s to 12 GB/s.
-    // Using "streaming stores" boosts us from 12 GB/s to 19 GB/s.
-    else {
-        sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64;
-        sz_size_t tail_length = (sz_size_t)(target + length) % 64;
-        sz_size_t body_length = length - head_length - tail_length;
-        __mmask64 head_mask = _sz_u64_mask_until(head_length);
-        __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
-        _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source));
-        _mm512_mask_storeu_epi8(target + head_length + body_length, tail_mask,
-                                _mm512_maskz_loadu_epi8(tail_mask, source));
-
-        // Now in the main loop, we can use non-temporal stores,
-        // performing the operation in both directions.
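        // Note how the loop below advances `target` and `source` by 64 bytes from the front on
        // every step, while `body_length` shrinks by 128: each iteration also writes one cache
        // line at the back, at `target + body_length - 64`. It exits once fewer than 128 bytes
        // remain in the middle, leaving at most one 64-byte block for the trailing `if`.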
- for (target += head_length, source += head_length; // - body_length >= 128; // - target += 64, source += 64, body_length -= 128) { - _mm512_stream_si512((__m512i *)(target), _mm512_loadu_si512(source)); - _mm512_stream_si512((__m512i *)(target + body_length - 64), _mm512_loadu_si512(source + body_length - 64)); - } - if (body_length >= 64) _mm512_stream_si512((__m512i *)target, _mm512_loadu_si512(source)); - } -} - -SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - if (target == source) return; // Don't be silly, don't move the data if it's already there. - - // On very short buffers, that are one cache line in width or less, we don't need any loops. - // We can also avoid any data-dependencies between iterations, assuming we have 32 registers - // to pre-load the data, before writing it back. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - } - else if (length <= 128) { - sz_size_t last_length = length - 64; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_maskz_loadu_epi8(mask, source + 64); - _mm512_storeu_epi8(target, source0); - _mm512_mask_storeu_epi8(target + 64, mask, source1); - } - else if (length <= 192) { - sz_size_t last_length = length - 128; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_loadu_epi8(source + 64); - __m512i source2 = _mm512_maskz_loadu_epi8(mask, source + 128); - _mm512_storeu_epi8(target, source0); - _mm512_storeu_epi8(target + 64, source1); - _mm512_mask_storeu_epi8(target + 128, mask, source2); - } - else if (length <= 256) { - sz_size_t last_length = length - 192; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_loadu_epi8(source + 64); - __m512i source2 = _mm512_loadu_epi8(source + 128); - __m512i source3 = _mm512_maskz_loadu_epi8(mask, source + 192); - _mm512_storeu_epi8(target, source0); - _mm512_storeu_epi8(target + 64, source1); - _mm512_storeu_epi8(target + 128, source2); - _mm512_mask_storeu_epi8(target + 192, mask, source3); - } - - // If the regions don't overlap at all, just use "copy" and save some brain cells thinking about corner cases. - else if (target + length < source || target >= source + length) { sz_copy_avx512(target, source, length); } - - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - // The absolute most common case of using "moves" is shifting the data within a continuous buffer - // when adding a removing some values in it. In such cases, a typical shift is by 1, 2, 4, 8, 16, - // or 32 bytes, rarely larger. For small shifts, under the size of the ZMM register, we can use shuffles. 
- // - // Remember: - // - if we are shifting data left, that we are traversing to the right. - // - if we are shifting data right, that we are traversing to the left. - int const left_to_right_traversal = source > target; - - // Now we guarantee, that the relative shift within registers is from 1 to 63 bytes and the output is aligned. - // Hopefully, we need to shift more than two ZMM registers, so we could consider `valignr` instruction. - // Sadly, using `_mm512_alignr_epi8` doesn't make sense, as it operates at a 128-bit granularity. - // - // - `_mm256_alignr_epi8` shifts entire 256-bit register, but we need many of them. - // - `_mm512_alignr_epi32` shifts 512-bit chunks, but only if the `shift` is a multiple of 4 bytes. - // - `_mm512_alignr_epi64` shifts 512-bit chunks by 8 bytes. - // - // All of those have a latency of 1 cycle, and the shift amount must be an immediate value! - // For 1-byte-shift granularity, the `_mm512_permutex2var_epi8` has a latency of 6 and needs VBMI! - // The most efficient and broadly compatible alternative could be to use a combination of align and shuffle. - // A similar approach was outlined in "Byte-wise alignr in AVX512F" by Wojciech Muła. - // http://0x80.pl/notesen/2016-10-16-avx512-byte-alignr.html - // - // That solution, is extremely mouthful, assuming we need compile time constants for the shift amount. - // A cleaner one, with a latency of 3 cycles, is to use `_mm512_permutexvar_epi8` or - // `_mm512_mask_permutexvar_epi8`, which can be seen as combination of a cross-register shuffle and blend, - // and is available with VBMI. That solution is still noticeably slower than AVX2. - // - // The GLibC implementation also uses non-temporal stores for larger buffers, we don't. - // https://codebrowser.dev/glibc/glibc/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S.html - if (left_to_right_traversal) { - // Head, body, and tail. - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - for (target += head_length, source += head_length; body_length >= 64; - target += 64, source += 64, body_length -= 64) - _mm512_store_si512(target, _mm512_loadu_si512(source)); - _mm512_mask_storeu_epi8(target, tail_mask, _mm512_maskz_loadu_epi8(tail_mask, source)); - } - else { - // Tail, body, and head. 
- _mm512_mask_storeu_epi8(target + head_length + body_length, tail_mask, - _mm512_maskz_loadu_epi8(tail_mask, source + head_length + body_length)); - for (; body_length >= 64; body_length -= 64) - _mm512_store_si512(target + head_length + body_length - 64, - _mm512_loadu_si512(source + head_length + body_length - 64)); - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - } - } -} - -SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); - - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + sz_u64_ctz(mask); - h += 64, h_length -= 64; - } - - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + sz_u64_ctz(mask); - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_avx512(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. - __mmask64 matches; - __mmask64 mask; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. - // We have several optimized versions of the lagorithm for shorter strings, - // but they all mimic the default case for unbounded length needles - if (n_length >= 64) { - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - if (sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - - // TODO: If the last character contains a bad byte, we can reposition the start of the next iteration. - // This will be very helpful for very long needles. - } - } - // If there are only 2 or 3 characters in the needle, we don't even need the nested loop. 
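    // With at most 3 bytes in the needle, `offset_first`, `offset_mid`, and `offset_last` together
    // cover every byte of the needle, so the intersection of the three equality masks is already an
    // exact match mask and no `sz_equal_avx512` confirmation is needed.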
- else if (n_length <= 3) { - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - if (matches) return h + sz_u64_ctz(matches); - } - } - // If the needle is smaller than the size of the ZMM register, we can use masked comparisons - // to avoid the the inner-most nested loop and compare the entire needle against a haystack - // slice in 3 CPU cycles. - else { - __mmask64 n_mask = _sz_u64_mask_until(n_length); - sz_u512_vec_t n_full_vec, h_full_vec; - n_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, n); - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - h_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, h + potential_offset); - if (_mm512_mask_cmpneq_epi8_mask(n_mask, h_full_vec.zmm, n_full_vec.zmm) == 0) - return h + potential_offset; - matches &= matches - 1; - } - } - } - - // The "tail" of the function uses masked loads to process the remaining bytes. - { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - if (n_length <= 3 || sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); - - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h + h_length - 64); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + h_length - 1 - sz_u64_clz(mask); - h_length -= 64; - } - - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + 64 - sz_u64_clz(mask) - 1; - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. 
- if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx512(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. - __mmask64 mask; - __mmask64 matches; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. - sz_cptr_t h_reversed; - for (; h_length >= n_length + 64; h_length -= 64) { - h_reversed = h + h_length - n_length - 64 + 1; - h_first_vec.zmm = _mm512_loadu_si512(h_reversed + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h_reversed + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h_reversed + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_avx512(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } - - // The "tail" of the function uses masked loads to process the remaining bytes. - { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_avx512(h + 64 - potential_offset - 1, n, n_length)) - return h + 64 - potential_offset - 1; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } - - return SZ_NULL_CHAR; -} - -#pragma clang attribute pop -#pragma GCC pop_options - -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ - apply_to = function) - -/** - * @brief Computes the edit distance between two very short byte-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 63, and evaluates at most (63 * 2 + 1 = 127) diagonals, or just as many loop cycles. - * Supports an early exit, if the distance is bounded. - * Keeps all of the data and Levenshtein matrices skew diagonal in just a couple of registers. - * Benefits from the @b `vpermb` instructions, that can rotate the bytes across the entire ZMM register. 
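- *
- * As a small worked example: for "kitten" (6 bytes) and "sitting" (7 bytes) the underlying
- * DP matrix is 7 x 8, so the kernel sweeps 7 + 8 - 1 = 14 skewed diagonals, keeping only
- * three of them in registers at any point in time.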
- */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - - sz_size_t const max_length = 63u; - sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); - sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); - - // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register. - // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. - sz_size_t const shorter_dim = shorter_length + 1; - sz_size_t const longer_dim = longer_length + 1; - - // The next few buffers will be swapped around. - sz_u512_vec_t previous_vec, current_vec, next_vec; - sz_u512_vec_t gaps_vec, substitutions_vec; - - // Load the strings into ZMM registers - just once. - sz_u512_vec_t longer_vec, shorter_vec, shorter_rotated_vec, rotate_left_vec, rotate_right_vec, ones_vec, bound_vec; - longer_vec.zmm = _mm512_maskz_loadu_epi8(_sz_u64_mask_until(longer_length), longer); - rotate_left_vec.zmm = _mm512_set_epi8( // - 0, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, // - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, // - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, // - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - rotate_right_vec.zmm = _mm512_set_epi8( // - 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, // - 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, // - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, // - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 63); - ones_vec.zmm = _mm512_set1_epi8(1); - bound_vec.zmm = _mm512_set1_epi8(bound <= 255 ? (sz_u8_t)bound : 255); - - // To simplify comparisons and traversals, we want to reverse the order of bytes in the shorter string. - for (sz_size_t i = 0; i != shorter_length; ++i) shorter_vec.u8s[63 - i] = shorter[i]; - shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_vec.zmm); - - // Let's say we are dealing with 3 and 5 letter words. - // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim). - // It will have: - // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3. - // - 2 diagonals of fixed length, at positions: 4, 5. - // - 3 diagonals of decreasing length, at positions: 6, 7, 8. - sz_size_t const diagonals_count = shorter_dim + longer_dim - 1; - - // Initialize the first two diagonals: - // - // previous_vec.u8s[0] = 0; - // current_vec.u8s[0] = current_vec.u8s[1] = 1; - // - // We can do a similar thing with vector ops: - previous_vec.zmm = _mm512_setzero_si512(); - current_vec.zmm = _mm512_set1_epi8(1); - - // We skip diagonals 0 and 1, as they are trivial. - // We will start with diagonal 2, which has length 3, with the first and last elements being preset, - // so we are effectively computing just one value, as will be marked by a single set bit in - // the `next_diagonal_mask` on the very first iteration. - sz_size_t next_diagonal_index = 2; - __mmask64 next_diagonal_mask = 0; - - // Progress through the upper triangle of the Levenshtein matrix. 
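- // As a scalar reference (an illustrative sketch, not shipped code), every cell of a new
- // skewed diagonal is produced from its two predecessors roughly as:
- //
- // next[i] = sz_min_of_two(sz_min_of_two(current[i - 1], current[i]) + 1, // insertion or deletion
- // previous[i - 1] + (shorter_char != longer_char)); // substitution
- //
- // The loops below evaluate a whole diagonal at once, keeping it reversed and rotated inside
- // ZMM registers, so the `i - 1` neighbor is obtained with a byte rotation instead of indexing.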
- for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) { - // After this iteration, the values at offset `0` and `next_diagonal_index` in the `next_vec` - // should be set to `next_diagonal_index`, but it's easier to broadcast the value to the whole vector, - // and later merge with a mask with new values. - next_vec.zmm = _mm512_set1_epi8((sz_u8_t)next_diagonal_index); - - // The mask also adds one set bit. - next_diagonal_mask = _kor_mask64(next_diagonal_mask, 1); - next_diagonal_mask = _kshiftli_mask64(next_diagonal_mask, 1); - - // Check for equality between string slices. - __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - substitutions_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, substitutions_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(_mm512_permutexvar_epi8(rotate_right_vec.zmm, current_vec.zmm), current_vec.zmm), - ones_vec.zmm); - next_vec.zmm = _mm512_mask_min_epu8(next_vec.zmm, next_diagonal_mask, gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = current_vec.zmm; - current_vec.zmm = next_vec.zmm; - - // Shift the shorter string - shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_rotated_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. - __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - } - - // Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles. - for (; next_diagonal_index != longer_dim; ++next_diagonal_index) { - // After this iteration, the value `shorted_dim - 1` in the `next_vec` - // should be set to `next_diagonal_index`, but it's easier to broadcast the value to the whole vector, - // and later merge with a mask with new values. - next_vec.zmm = _mm512_set1_epi8((sz_u8_t)next_diagonal_index); - - // Make sure we update the first entry. - next_diagonal_mask = _kor_mask64(next_diagonal_mask, 1); - - // Check for equality between string slices. - __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(current_vec.zmm, _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm)), - ones_vec.zmm); - next_vec.zmm = _mm512_mask_min_epu8(next_vec.zmm, next_diagonal_mask, gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm); - current_vec.zmm = next_vec.zmm; - - // Let's shift the longer string now. - longer_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, longer_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. 
- __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm);
- if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { //
- return SZ_SIZE_MAX;
- }
- }
-
- // Now let's handle the bottom right triangle.
- for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) {
-
- // Check for equality between string slices.
- __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm);
- substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm);
- gaps_vec.zmm = _mm512_add_epi8(
- // Insertions or deletions
- _mm512_min_epu8(current_vec.zmm, _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm)),
- ones_vec.zmm);
- next_vec.zmm = _mm512_min_epu8(gaps_vec.zmm, substitutions_vec.zmm);
-
- // Mark the current skewed diagonal as the previous one and the next one as the current one.
- previous_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm);
- current_vec.zmm = next_vec.zmm;
-
- // Let's shift the longer string now.
- longer_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, longer_vec.zmm);
-
- // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound.
- __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm);
- if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { //
- return SZ_SIZE_MAX;
- }
- // In every following iteration we use a shorter prefix of each register,
- // but we don't need to update the `next_diagonal_mask` anymore... except for the early exit.
- next_diagonal_mask = _kshiftri_mask64(next_diagonal_mask, 1);
- }
- return current_vec.u8s[0];
-}
-
-/**
- * @brief Computes the edit distance between two somewhat short byte-strings using the AVX-512VBMI extensions.
- *
- * Applies to string lengths up to 127, and evaluates at most (127 * 2 + 1 = 255) diagonals.
- * Supports an early exit, if the distance is bounded.
- * Uses a lot more CPU register space than the `upto63` variant.
- * Benefits from the @b `vpermi2b` instructions, that can rotate the bytes in 2 registers at once.
- *
- * This may be one of the most frequently called kernels for:
- * - source code analysis, assuming most lines are either under 80 or under 120 characters long.
- * - DNA sequence alignment, as most short reads are 50-300 characters long.
- */
-SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto127_avx512( //
- sz_cptr_t shorter, sz_size_t shorter_length, //
- sz_cptr_t longer, sz_size_t longer_length, //
- sz_size_t bound) {
- sz_unused(shorter && shorter_length && longer && longer_length && bound);
- return 0;
-}
-
-/**
- * @brief Computes the edit distance between two longer byte-strings using the AVX-512VBMI extensions.
- *
- * Applies to string lengths up to 255, and evaluates at most (255 * 2 + 1 = 511) diagonals.
- * Supports an early exit, if the distance is bounded.
- * Uses a lot more CPU register space than the `upto63` variant.
- *
- * Each of the 2x strings ends up occupying 4 ZMM registers, and each of 3x diagonals uses 4 ZMM registers.
- * So 20x of the 32x are persistently occupied, and the rest are used for math temporarily.
- * This is the largest space-efficient variant, as strings beyond 255 characters may require
- * 16-bit accumulators, which would be a significant bottleneck.
- */
-SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto_avx512( //
- sz_cptr_t shorter, sz_size_t shorter_length, //
- sz_cptr_t longer, sz_size_t longer_length, //
- sz_size_t bound) {
- sz_unused(shorter && shorter_length && longer && longer_length && bound);
- return 0;
-}
-
-/**
- * @brief Computes the edit distance between two longer byte-strings using the AVX-512VBMI extensions,
- * assuming the upper distance bound cannot exceed 255, but the string length can be arbitrary.
- *
- * Applies to string lengths up to 255, and evaluates at most (255 * 2 + 1 = 511) diagonals.
- * Supports an early exit, if the distance is bounded.
- * Uses a lot more CPU register space than the `upto63` variant.
- *
- * Each of the 2x strings ends up occupying 4 ZMM registers, and each of 3x diagonals uses 4 ZMM registers.
- * So 20x of the 32x are persistently occupied, and the rest are used for math temporarily.
- * This is the largest space-efficient variant, as strings beyond 255 characters may require
- * 16-bit accumulators, which would be a significant bottleneck.
- */
-SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto255bound_avx512( //
- sz_cptr_t shorter, sz_size_t shorter_length, //
- sz_cptr_t longer, sz_size_t longer_length, //
- sz_size_t bound) {
- sz_unused(shorter && shorter_length && longer && longer_length && bound);
- return 0;
-}
-
-/**
- * @brief Computes the edit distance between two mid-length UTF-8-strings using the AVX-512VBMI extensions.
- *
- * Applies to string lengths up to 127, and evaluates at most (127 * 2 + 1 = 255) diagonals.
- * Supports an early exit, if the distance is bounded.
- * Benefits from the @b `valignd` instructions used to rotate UTF-32 unpacked Unicode codepoints.
- *
- * Each string is unpacked into 128 characters * 4 bytes per character / 64 bytes per register = 8 registers.
- *
- */
-SZ_INTERNAL sz_size_t _sz_edit_distance_utf8_skewed_diagonals_upto127_avx512( //
- sz_cptr_t shorter, sz_size_t shorter_length, //
- sz_cptr_t longer, sz_size_t longer_length, //
- sz_size_t bound) {
- sz_unused(shorter && shorter_length && longer && longer_length && bound);
- return 0;
-}
-
-SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( //
- sz_cptr_t shorter, sz_size_t shorter_length, //
- sz_cptr_t longer, sz_size_t longer_length, //
- sz_size_t bound, sz_memory_allocator_t *alloc) {
-
- sz_unused(shorter && longer && bound && alloc);
-
- // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
- sz_memory_allocator_t global_alloc;
- if (!alloc) {
- sz_memory_allocator_init_default(&global_alloc);
- alloc = &global_alloc;
- }
-
- // TODO: Generalize!
- sz_size_t const max_length = 256u * 256u;
- sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one.");
- sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant.");
- sz_unused(longer_length && bound && max_length);
-
-#if 0
- // We are going to store 3 diagonals of the matrix.
- // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`.
- sz_size_t const shorter_dim = shorter_length + 1;
- sz_size_t const longer_dim = longer_length + 1;
- // Unlike the serial version, we also want to avoid reverse-order iteration over the shorter string.
- // So let's allocate a bit more memory and reverse-export our shorter string into that buffer.
- sz_size_t const buffer_length = sizeof(sz_u16_t) * longer_dim * 3 + shorter_length; - sz_u16_t *const distances = (sz_u16_t *)alloc->allocate(buffer_length, alloc->handle); - if (!distances) return SZ_SIZE_MAX; - - // The next few pointers will be swapped around. - sz_u16_t *previous_distances = distances; - sz_u16_t *current_distances = previous_distances + longer_dim; - sz_u16_t *next_distances = current_distances + longer_dim; - sz_ptr_t const shorter_reversed = (sz_ptr_t)(next_distances + longer_dim); - - // Export the reversed string into the buffer. - for (sz_size_t i = 0; i != shorter_length; ++i) shorter_reversed[i] = shorter[shorter_length - 1 - i]; - - // Initialize the first two diagonals: - previous_distances[0] = 0; - current_distances[0] = current_distances[1] = 1; - - // Using ZMM registers, we can process 32x 16-bit values at once, - // storing 16 bytes of each string in YMM registers. - sz_u512_vec_t insertions_vec, deletions_vec, substitutions_vec, next_vec; - sz_u512_vec_t ones_u16_vec; - ones_u16_vec.zmm = _mm512_set1_epi16(1); - - // This is a mixed-precision implementation, using 8-bit representations for part of the operations. - // Even there, in case `SZ_USE_X86_AVX2=0`, let's use the `sz_u512_vec_t` type, addressing the first YMM halfs. - sz_u512_vec_t shorter_vec, longer_vec; - sz_u512_vec_t ones_u8_vec; - ones_u8_vec.ymms[0] = _mm256_set1_epi8(1); - - // Let's say we are dealing with 3 and 5 letter words. - // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim). - // It will have: - // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3. - // - 2 diagonals of fixed length, at positions: 4, 5. - // - 3 diagonals of decreasing length, at positions: 6, 7, 8. - sz_size_t const diagonals_count = shorter_dim + longer_dim - 1; - - // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_diagonal_index = 2; - for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = next_diagonal_index + 1; - for (sz_size_t offset_within_diagonal = 0; offset_within_diagonal + 2 < next_diagonal_length;) { - sz_u32_t remaining_length = (sz_u32_t)(next_diagonal_length - offset_within_diagonal - 2); - sz_u32_t register_length = remaining_length < 32 ? remaining_length : 32; - sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length); - longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + offset_within_diagonal); - // Our original code addressed the shorter string `[next_diagonal_index - offset_within_diagonal - 2]` - // for growing `offset_within_diagonal`. If the `shorter` string was reversed, the - // `[next_diagonal_index - offset_within_diagonal - 2]` would be equal to `[shorter_length - 1 - - // next_diagonal_index + offset_within_diagonal + 2]`. Which simplified would be equal to - // `[shorter_length - next_diagonal_index + offset_within_diagonal + 1]`. - shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8( // - remaining_length_mask, - shorter_reversed + shorter_length - next_diagonal_index + offset_within_diagonal + 1); - // For substitutions, perform the equality comparison using AVX2 instead of AVX-512 - // to get the result as a vector, instead of a bitmask. Adding 1 to every scalar we can overflow - // transforming from {0xFF, 0} values to {0, 1} values - exactly what we need. Then - upcast to 16-bit. 
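- // As a tiny worked example of that trick: `_mm256_cmpeq_epi8` yields 0xFF where the bytes
- // match and 0x00 where they differ, so adding 1 wraps 0xFF to 0x00 and turns 0x00 into 0x01,
- // which is exactly the per-character substitution cost (0 for a match, 1 for a mismatch).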
- substitutions_vec.zmm = _mm512_cvtepi8_epi16( // - _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0])); - substitutions_vec.zmm = _mm512_add_epi16( // - substitutions_vec.zmm, - _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + offset_within_diagonal)); - // For insertions and deletions, on modern hardware, it's faster to issue two separate loads, - // than rotate the bytes in the ZMM register. - insertions_vec.zmm = - _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + offset_within_diagonal); - deletions_vec.zmm = - _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + offset_within_diagonal + 1); - // First get the minimum of insertions and deletions. - next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm); - next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm); - _mm512_mask_storeu_epi16(next_distances + offset_within_diagonal + 1, remaining_length_mask, next_vec.zmm); - offset_within_diagonal += register_length; - } - // Don't forget to populate the first row and the first column of the Levenshtein matrix. - next_distances[0] = next_distances[next_diagonal_length - 1] = (sz_u16_t)next_diagonal_index; - // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory. - sz_u16_t *temporary = previous_distances; - previous_distances = current_distances; - current_distances = next_distances; - next_distances = temporary; - } - - // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a - // larger diagonal. From now onwards, we will be shrinking. Instead of adding value equal to the skewed diagonal - // index on either side, we will be cropping those values out. - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; - for (sz_size_t i = 0; i != next_diagonal_length;) { - sz_u32_t remaining_length = (sz_u32_t)(next_diagonal_length - i); - sz_u32_t register_length = remaining_length < 32 ? remaining_length : 32; - sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length); - longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + next_diagonal_index - n + i); - // Our original code addressed the shorter string `[shorter_length - 1 - i]` for growing `i`. - // If the `shorter` string was reversed, the `[shorter_length - 1 - i]` would - // be equal to `[shorter_length - 1 - shorter_length + 1 + i]`. - // Which simplified would be equal to just `[i]`. Beautiful! - shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, shorter_reversed + i); - // For substitutions, perform the equality comparison using AVX2 instead of AVX-512 - // to get the result as a vector, instead of a bitmask. The compare it against the accumulated - // substitution costs. - substitutions_vec.zmm = _mm512_cvtepi8_epi16( // - _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0])); - substitutions_vec.zmm = _mm512_add_epi16( // - substitutions_vec.zmm, _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + i)); - // For insertions and deletions, on modern hardware, it's faster to issue two separate loads, - // than rotate the bytes in the ZMM register. 
- insertions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i); - deletions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i + 1); - // First get the minimum of insertions and deletions. - next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm); - next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm); - _mm512_mask_storeu_epi16(next_distances + i, remaining_length_mask, next_vec.zmm); - i += register_length; - } - - // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory, this time, with a shift, - // dropping the first element in the current array. - sz_u16_t *temporary = previous_distances; - previous_distances = current_distances + 1; - current_distances = next_distances; - next_distances = temporary; - } - - // Cache scalar before `free` call. - sz_size_t result = current_distances[0]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -#endif - return 0; -} - -SZ_INTERNAL sz_size_t sz_edit_distance_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Bounded computations may exit early. - int const is_bounded = bound < longer_length; - if (is_bounded) { - // If one of the strings is empty - the edit distance is equal to the length of the other one. - if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); - // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; - } - - // Make sure the shorter string is actually shorter. - if (shorter_length > longer_length) { - sz_cptr_t temporary = shorter; - shorter = longer; - longer = temporary; - sz_size_t temporary_length = shorter_length; - shorter_length = longer_length; - longer_length = temporary_length; - } - - // Dispatch the right implementation based on the length of the strings. - if (longer_length < 64u) - return _sz_edit_distance_skewed_diagonals_upto63_avx512( // - shorter, shorter_length, longer, longer_length, bound); - // else if (longer_length < 256u * 256u) - // return _sz_edit_distance_skewed_diagonals_upto65k_avx512( // - // shorter, shorter_length, longer, longer_length, bound, alloc); - else - return sz_edit_distance_serial(shorter, shorter_length, longer, longer_length, bound, alloc); -} - -SZ_PUBLIC sz_u64_t sz_checksum_avx512(sz_cptr_t text, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "loads". - // - // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, - // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. - // With two strings, we may consider the overal workload huge, if each exceeds 1 MB in length. - int const is_huge = length >= 1ull * 1024ull * 1024ull; - sz_u512_vec_t text_vec, sums_vec; - - // When the buffer is small, there isn't much to innovate. 
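- // As a scalar reference (an illustrative sketch, not shipped code), the whole function
- // just sums the bytes:
- //
- // sz_u64_t sum = 0;
- // for (sz_size_t i = 0; i != length; ++i) sum += (sz_u8_t)text[i];
- // return sum;
- //
- // Every SIMD branch below computes the same value, using `SAD` (Sum of Absolute Differences)
- // against a zero vector to horizontally add each group of 8 bytes into a 64-bit lane.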
- if (length <= 16) {
- __mmask16 mask = _sz_u16_mask_until(length);
- text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text);
- sums_vec.xmms[0] = _mm_sad_epu8(text_vec.xmms[0], _mm_setzero_si128());
- sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_vec.xmms[0]);
- sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_vec.xmms[0], 1);
- return low + high;
- }
- else if (length <= 32) {
- __mmask32 mask = _sz_u32_mask_until(length);
- text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text);
- sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256());
- // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first.
- __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]);
- __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1);
- __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm);
- sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm);
- sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1);
- return low + high;
- }
- else if (length <= 64) {
- __mmask64 mask = _sz_u64_mask_until(length);
- text_vec.zmm = _mm512_maskz_loadu_epi8(mask, text);
- sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
- return _mm512_reduce_add_epi64(sums_vec.zmm);
- }
- else if (!is_huge) {
- sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less.
- sz_size_t tail_length = (sz_size_t)(text + length) % 64; // 63 or less.
- sz_size_t body_length = length - head_length - tail_length; // Multiple of 64.
- __mmask64 head_mask = _sz_u64_mask_until(head_length);
- __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
- text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
- sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
- for (text += head_length; body_length >= 64; text += 64, body_length -= 64) {
- text_vec.zmm = _mm512_load_si512((__m512i const *)text);
- sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
- }
- text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text);
- sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
- return _mm512_reduce_add_epi64(sums_vec.zmm);
- }
- // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use.
- //
- // 1. Moving in both directions to maximize the throughput, when fetching from multiple
- // memory pages. Also helps with cache set-associativity issues, as we won't always
- // be fetching the same entries in the lookup table.
- // 2. Using non-temporal stores to avoid polluting the cache.
- // 3. Prefetching the next cache line, to avoid stalling the CPU. This is generally useless
- // for predictable patterns, so disregard this advice.
- //
- // Bidirectional traversal generally adds about 10% to such algorithms.
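- // Sketching the traversal below: every iteration consumes one 64-byte block from the front
- // at `text` and one from the back at `text + body_length - 64`, so the front pointer advances
- // by 64 while `body_length` shrinks by 128, until at most one aligned block remains in the middle.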
- else {
- sz_u512_vec_t text_reversed_vec, sums_reversed_vec;
- sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64;
- sz_size_t tail_length = (sz_size_t)(text + length) % 64;
- sz_size_t body_length = length - head_length - tail_length;
- __mmask64 head_mask = _sz_u64_mask_until(head_length);
- __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
-
- text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
- sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
- text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length);
- sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512());
-
- // Now in the main loop, we can use non-temporal loads,
- // performing the operation in both directions.
- for (text += head_length; body_length >= 128; text += 64, body_length -= 128) {
- text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
- sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
- text_reversed_vec.zmm = _mm512_stream_load_si512((__m512i *)(text + body_length - 64));
- sums_reversed_vec.zmm =
- _mm512_add_epi64(sums_reversed_vec.zmm, _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()));
- }
- if (body_length >= 64) {
- text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
- sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
- }
-
- return _mm512_reduce_add_epi64(_mm512_add_epi64(sums_vec.zmm, sums_reversed_vec.zmm));
- }
-}
-
-SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, //
- sz_hash_callback_t callback, void *callback_handle) {
-
- if (length < window_length || !window_length) return;
- if (length < 4 * window_length) {
- sz_hashes_serial(start, length, window_length, step, callback, callback_handle);
- return;
- }
-
- // Using AVX2, we can perform 4 long integer multiplications and additions within one register.
- // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel.
- sz_size_t const max_hashes = length - window_length + 1;
- sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads.
- sz_u8_t const *text_first = (sz_u8_t const *)start;
- sz_u8_t const *text_second = text_first + min_hashes_per_thread;
- sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2;
- sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3;
- sz_u8_t const *text_end = text_first + length;
-
- // Broadcast the global constants into the registers.
- // Both high and low hashes will work with the same prime and golden ratio.
- sz_u512_vec_t prime_vec, golden_ratio_vec;
- prime_vec.zmm = _mm512_set1_epi64(SZ_U64_MAX_PRIME);
- golden_ratio_vec.zmm = _mm512_set1_epi64(11400714819323198485ull);
-
- // Prepare the `base ^ (window_length - 1)` values, that we are going to use for modulo arithmetic.
- sz_u64_t prime_power_low = 1, prime_power_high = 1;
- for (sz_size_t i = 0; i + 1 < window_length; ++i)
- prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME,
- prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME;
-
- // We will be evaluating 4 offsets at a time with 2 different hash functions.
- // We can fit all those 8 state variables in each of the following ZMM registers.
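- // For reference, each lane effectively maintains a Rabin-Karp-style rolling hash. Ignoring the
- // per-lane character shift, the scalar recurrence for a window of length `w` over base `b`
- // (31 for the low lanes, 257 for the high ones) modulo `SZ_U64_MAX_PRIME` is roughly:
- //
- // hash = (hash - outgoing_char * pow(b, w - 1)) * b + incoming_char; // then reduce modulo the prime
- //
- // The vectorized loop below implements the same update, replacing the true modulo with a
- // conditional subtraction of the prime, which the surrounding comments argue is sufficient here.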
- sz_u512_vec_t base_vec, prime_power_vec, shift_vec; - base_vec.zmm = _mm512_set_epi64(31ull, 31ull, 31ull, 31ull, 257ull, 257ull, 257ull, 257ull); - shift_vec.zmm = _mm512_set_epi64(0ull, 0ull, 0ull, 0ull, 77ull, 77ull, 77ull, 77ull); - prime_power_vec.zmm = _mm512_set_epi64(prime_power_low, prime_power_low, prime_power_low, prime_power_low, - prime_power_high, prime_power_high, prime_power_high, prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u512_vec_t hash_vec, chars_vec; - hash_vec.zmm = _mm512_setzero_si512(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`... - chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - - // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); - } - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - sz_u512_vec_t hash_mix_vec; - hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_extracti64x4_epi64(hash_mix_vec.zmm, 0)); - - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. - chars_vec.zmm = _mm512_set_epi64(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length], // - text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - hash_vec.zmm = _mm512_sub_epi64(hash_vec.zmm, _mm512_mullo_epi64(chars_vec.zmm, prime_power_vec.zmm)); - - // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. 
- chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - - // ... and prefetch the next four characters into Level 2 or higher. - _mm_prefetch((sz_cptr_t)text_fourth + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_third + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_second + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_first + 1, _MM_HINT_T1); - - // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_castsi512_si256(hash_mix_vec.zmm)); - - if ((cycle & step_mask) == 0) { - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - } - } -} - -#pragma clang attribute pop -#pragma GCC pop_options - -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512vbmi", "avx512vbmi2", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512vbmi,avx512vbmi2,bmi,bmi2"))), \ - apply_to = function) - -SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - // But if at least 3 cache lines are touched, the AVX-512 implementation should be faster. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } - - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - // We need to pull the lookup table into 4x ZMM registers. - // We can use `vpermi2b` instruction to perform the look in two ZMM registers with `_mm512_permutex2var_epi8` - // intrinsics, but it has a 6-cycle latency on Sapphire Rapids and requires AVX512-VBMI. 
Assuming we need to
- // operate on 4 registers, it might be cleaner to use 4x separate `_mm512_permutexvar_epi8` calls,
- // combining the results with 2x `_mm512_test_epi8_mask` and 3x blends afterwards.
- //
- // - 4x `_mm512_permutexvar_epi8` maps to "VPERMB (ZMM, ZMM, ZMM)":
- // - On Ice Lake: 3 cycles latency, ports: 1*p5
- // - On Genoa: 6 cycles latency, ports: 1*FP12
- // - 3x `_mm512_mask_blend_epi8` maps to "VPBLENDMB_Z (ZMM, K, ZMM, ZMM)":
- // - On Ice Lake: 3 cycles latency, ports: 1*p05
- // - On Genoa: 1 cycle latency, ports: 1*FP0123
- // - 2x `_mm512_test_epi8_mask` maps to "VPTESTMB (K, ZMM, ZMM)":
- // - On Ice Lake: 3 cycles latency, ports: 1*p5
- // - On Genoa: 4 cycles latency, ports: 1*FP01
- //
- sz_u512_vec_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec;
- lut_0_to_63_vec.zmm = _mm512_loadu_si512((lut));
- lut_64_to_127_vec.zmm = _mm512_loadu_si512((lut + 64));
- lut_128_to_191_vec.zmm = _mm512_loadu_si512((lut + 128));
- lut_192_to_255_vec.zmm = _mm512_loadu_si512((lut + 192));
-
- sz_u512_vec_t first_bit_vec, second_bit_vec;
- first_bit_vec.zmm = _mm512_set1_epi8((char)0x80);
- second_bit_vec.zmm = _mm512_set1_epi8((char)0x40);
-
- __mmask64 first_bit_mask, second_bit_mask;
- sz_u512_vec_t source_vec;
- // If the top bit is set in a byte of `source_vec`, then we use `lookup_128_to_191_vec` or
- // `lookup_192_to_255_vec`. If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`.
- sz_u512_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec;
- sz_u512_vec_t blended_0_to_127_vec, blended_128_to_255_vec, blended_0_to_255_vec;
-
- // Handling the head.
- if (head_length) {
- source_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, source);
- lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm);
- lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm);
- lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm);
- lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm);
- first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm);
- second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm);
- blended_0_to_127_vec.zmm =
- _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm);
- blended_128_to_255_vec.zmm =
- _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm);
- blended_0_to_255_vec.zmm =
- _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm);
- _mm512_mask_storeu_epi8(target, head_mask, blended_0_to_255_vec.zmm);
- source += head_length, target += head_length, length -= head_length;
- }
-
- // Handling the body in 64-byte chunks aligned to cache-line boundaries with respect to `target`.
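- // As a scalar reference for the loop below (an illustrative sketch, not shipped code),
- // the whole transform is just a 256-entry table lookup per byte:
- //
- // for (sz_size_t i = 0; i != length; ++i) target[i] = lut[(sz_u8_t)source[i]];
- //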
- while (length >= 64) { - source_vec.zmm = _mm512_loadu_si512(source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_store_si512(target, blended_0_to_255_vec.zmm); //! Aligned store, our main weapon! - source += 64, target += 64, length -= 64; - } - - // Handling the tail. - if (tail_length) { - source_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_mask_storeu_epi8(target, tail_mask, blended_0_to_255_vec.zmm); - source += tail_length, target += tail_length, length -= tail_length; - } -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - - // Before initializing the AVX-512 vectors, we may want to run the sequential code for the first few bytes. - // In practice, that only hurts, even when we have matches every 5-ish bytes. - // - // if (length < SZ_SWAR_THRESHOLD) return sz_find_charset_serial(text, length, filter); - // sz_cptr_t early_result = sz_find_charset_serial(text, SZ_SWAR_THRESHOLD, filter); - // if (early_result) return early_result; - // text += SZ_SWAR_THRESHOLD; - // length -= SZ_SWAR_THRESHOLD; - // - // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. - // That way when we invoke `_mm512_shuffle_epi8` we can use the same mask for both lanes. - sz_u512_vec_t filter_even_vec, filter_odd_vec; - __m256i filter_ymm = _mm256_lddqu_si256((__m256i const *)filter); - // There are a few way to initialize filters without having native strided loads. 
- // In the chronological order of experiments:
- // - serial code initializing 128 bytes of odd and even mask
- // - using several shuffles
- // - using `_mm512_permutexvar_epi8`
- // - using `_mm512_broadcast_i32x4(_mm256_castsi256_si128(_mm256_maskz_compress_epi8(0x55555555, filter_ymm)))`
- // and `_mm512_broadcast_i32x4(_mm256_castsi256_si128(_mm256_maskz_compress_epi8(0xaaaaaaaa, filter_ymm)))`
- filter_even_vec.zmm = _mm512_broadcast_i32x4(_mm256_castsi256_si128( // broadcast __m128i to __m512i
- _mm256_maskz_compress_epi8(0x55555555, filter_ymm)));
- filter_odd_vec.zmm = _mm512_broadcast_i32x4(_mm256_castsi256_si128( // broadcast __m128i to __m512i
- _mm256_maskz_compress_epi8(0xaaaaaaaa, filter_ymm)));
- // After the unzipping operation, we can validate the contents of the vectors like this:
- //
- // for (sz_size_t i = 0; i != 16; ++i) {
- // sz_assert(filter_even_vec.u8s[i] == filter->_u8s[i * 2]);
- // sz_assert(filter_odd_vec.u8s[i] == filter->_u8s[i * 2 + 1]);
- // sz_assert(filter_even_vec.u8s[i + 16] == filter->_u8s[i * 2]);
- // sz_assert(filter_odd_vec.u8s[i + 16] == filter->_u8s[i * 2 + 1]);
- // sz_assert(filter_even_vec.u8s[i + 32] == filter->_u8s[i * 2]);
- // sz_assert(filter_odd_vec.u8s[i + 32] == filter->_u8s[i * 2 + 1]);
- // sz_assert(filter_even_vec.u8s[i + 48] == filter->_u8s[i * 2]);
- // sz_assert(filter_odd_vec.u8s[i + 48] == filter->_u8s[i * 2 + 1]);
- // }
- //
- sz_u512_vec_t text_vec;
- sz_u512_vec_t lower_nibbles_vec, higher_nibbles_vec;
- sz_u512_vec_t bitset_even_vec, bitset_odd_vec;
- sz_u512_vec_t bitmask_vec, bitmask_lookup_vec;
- bitmask_lookup_vec.zmm = _mm512_set_epi8( //
- -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, //
- -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, //
- -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, //
- -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1);
-
- while (length) {
- // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set"
- // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so
- // StringZilla uses a somewhat different approach.
- // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new
- //
- // sz_u8_t input = *(sz_u8_t const *)text;
- // sz_u8_t lo_nibble = input & 0x0f;
- // sz_u8_t hi_nibble = input >> 4;
- // sz_u8_t bitset_even = filter_even_vec.u8s[hi_nibble];
- // sz_u8_t bitset_odd = filter_odd_vec.u8s[hi_nibble];
- // sz_u8_t bitmask = (1 << (lo_nibble & 0x7));
- // sz_u8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd;
- // if ((bitset & bitmask) != 0) return text;
- // else { length--, text++; }
- //
- // The nice part about this is that loading the strided data is very easy with Arm NEON,
- // while with x86 CPUs after AVX, shuffles within 256 bits shouldn't be an issue either.
- sz_size_t load_length = sz_min_of_two(length, 64); - __mmask64 load_mask = _sz_u64_mask_until(load_length); - text_vec.zmm = _mm512_maskz_loadu_epi8(load_mask, text); - lower_nibbles_vec.zmm = _mm512_and_si512(text_vec.zmm, _mm512_set1_epi8(0x0f)); - bitmask_vec.zmm = _mm512_shuffle_epi8(bitmask_lookup_vec.zmm, lower_nibbles_vec.zmm); - // - // At this point we can validate the `bitmask_vec` contents like this: - // - // for (sz_size_t i = 0; i != load_length; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); - // } - // - // Shift right every byte by 4 bits. - // There is no `_mm512_srli_epi8` intrinsic, so we have to use `_mm512_srli_epi16` - // and combine it with a mask to clear the higher bits. - higher_nibbles_vec.zmm = _mm512_and_si512(_mm512_srli_epi16(text_vec.zmm, 4), _mm512_set1_epi8(0x0f)); - bitset_even_vec.zmm = _mm512_shuffle_epi8(filter_even_vec.zmm, higher_nibbles_vec.zmm); - bitset_odd_vec.zmm = _mm512_shuffle_epi8(filter_odd_vec.zmm, higher_nibbles_vec.zmm); - // - // At this point we can validate the `bitset_even_vec` and `bitset_odd_vec` contents like this: - // - // for (sz_size_t i = 0; i != load_length; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t const *bitset_ptr = &filter->_u8s[0]; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; - // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); - // } - // - // TODO: Is this a good place for ternary logic? - __mmask64 take_first = _mm512_cmplt_epi8_mask(lower_nibbles_vec.zmm, _mm512_set1_epi8(8)); - bitset_even_vec.zmm = _mm512_mask_blend_epi8(take_first, bitset_odd_vec.zmm, bitset_even_vec.zmm); - __mmask64 matches_mask = _mm512_mask_test_epi8_mask(load_mask, bitset_even_vec.zmm, bitmask_vec.zmm); - if (matches_mask) { - int offset = sz_u64_ctz(matches_mask); - return text + offset; - } - else { text += load_length, length -= load_length; } - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); -} - -SZ_PUBLIC sz_cptr_t sz_find_many_avx512( // - sz_cptr_t haystack, sz_size_t haystack_length, // - sz_cptr_t const *needles, sz_size_t const *needles_lengths, // - sz_size_t *needle_offset) { - - // When dealing with huge needles vocabularies, like in tokenization workloads, we need to construct an automaton. - // But in many cases, the vocabulary is small enough to use a simpler DFA-less approach, combining the ideas from - // the `sz_find_avx512` and `sz_find_charset_avx512` functions. - // - // Pick the offsets within needles where there is the least variance in the characters. - // Like for "the", "then", "there", "these", "those", "their", "they", "them", "that", "this", "thus", "than": - // - // 0: 't' - // 1: 'h' - // 2: 'e', 'a', 'i', 'o', 'u' - // 3: 'n', 'r', 's', 'i', 'y', 'm', 't' - // - // So depending on our "register budget", we can use a different number of pivot points: offset 0, 1, 2 make - // the most sense if we can only use 3 ZMM registers. - sz_unused(haystack && haystack_length && needles && needles_lengths && needle_offset); - return 0; -} - -/** - * Computes the Needleman Wunsch alignment score between two strings. 
- * The method uses 32-bit integers to accumulate the running score for every cell in the matrix.
- * Assuming the costs of substitutions can be arbitrary signed 8-bit integers, the method is expected to be used
- * on strings not exceeding 2^24 length or 16.7 million characters.
- *
- * Unlike the `_sz_edit_distance_skewed_diagonals_upto65k_avx512` method, this one uses signed integers to store
- * the accumulated score. Moreover, its primary bottleneck is the latency of gathering the substitution costs
- * from the substitution matrix. If we use the diagonal order, we will be comparing a slice of the first string with
- * a slice of the second. If we stick to the conventional horizontal order, we will be comparing one character against
- * a slice, which is much easier to optimize. In that case we are sampling costs not from arbitrary parts of
- * a 256 x 256 matrix, but from a single row!
- */
-SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_avx512( //
- sz_cptr_t shorter, sz_size_t shorter_length, //
- sz_cptr_t longer, sz_size_t longer_length, //
- sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) {
-
- // If one of the strings is empty - the score is the length of the other one, multiplied by the gap penalty
- if (longer_length == 0) return (sz_ssize_t)shorter_length * gap;
- if (shorter_length == 0) return (sz_ssize_t)longer_length * gap;
-
- // Let's make sure that we use the amount of memory proportional to the
- // number of elements in the shorter string, not the longer.
- if (shorter_length > longer_length) {
- sz_pointer_swap((void **)&longer_length, (void **)&shorter_length);
- sz_pointer_swap((void **)&longer, (void **)&shorter);
- }
-
- // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
- sz_memory_allocator_t global_alloc;
- if (!alloc) {
- sz_memory_allocator_init_default(&global_alloc);
- alloc = &global_alloc;
- }
-
- sz_size_t const max_length = 256ull * 256ull * 256ull;
- sz_size_t const n = longer_length + 1;
- sz_assert(n < max_length && "The length must fit into 24-bit integer. Otherwise use serial variant.");
- sz_unused(longer_length && max_length);
-
- sz_size_t buffer_length = sizeof(sz_i32_t) * n * 2;
- sz_i32_t *distances = (sz_i32_t *)alloc->allocate(buffer_length, alloc->handle);
- sz_i32_t *previous_distances = distances;
- sz_i32_t *current_distances = previous_distances + n;
-
- // Initialize the first row of the Levenshtein matrix with `iota`.
- for (sz_size_t idx_longer = 0; idx_longer != n; ++idx_longer)
- previous_distances[idx_longer] = (sz_i32_t)idx_longer * gap;
-
- /// Contains up to 64 consecutive characters from the longer string.
- sz_u512_vec_t longer_vec;
- sz_u512_vec_t cost_deletion_vec, cost_substitution_vec, lookup_substitution_vec, current_vec;
- sz_u512_vec_t row_first_subs_vec, row_second_subs_vec, row_third_subs_vec, row_fourth_subs_vec;
- sz_u512_vec_t shuffled_first_subs_vec, shuffled_second_subs_vec, shuffled_third_subs_vec, shuffled_fourth_subs_vec;
-
- // Prepare constants and masks.
- sz_u512_vec_t is_third_or_fourth_vec, is_second_or_fourth_vec, gap_vec;
- {
- char is_third_or_fourth_check, is_second_or_fourth_check;
- *(sz_u8_t *)&is_third_or_fourth_check = 0x80, *(sz_u8_t *)&is_second_or_fourth_check = 0x40;
- is_third_or_fourth_vec.zmm = _mm512_set1_epi8(is_third_or_fourth_check);
- is_second_or_fourth_vec.zmm = _mm512_set1_epi8(is_second_or_fourth_check);
- gap_vec.zmm = _mm512_set1_epi32(gap);
- }
-
- sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter;
- for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) {
- sz_i32_t last_in_row = current_distances[0] = (sz_i32_t)(idx_shorter + 1) * gap;
-
- // Load one row of the substitution matrix into four ZMM registers.
- sz_error_cost_t const *row_subs = subs + shorter_unsigned[idx_shorter] * 256u;
- row_first_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 0);
- row_second_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 1);
- row_third_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 2);
- row_fourth_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 3);
-
- // In the serial version we have one forward pass that computes the deletion,
- // insertion, and substitution costs at once.
- // for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) {
- // sz_ssize_t cost_deletion = previous_distances[idx_longer + 1] + gap;
- // sz_ssize_t cost_insertion = current_distances[idx_longer] + gap;
- // sz_ssize_t cost_substitution = previous_distances[idx_longer] + row_subs[longer_unsigned[idx_longer]];
- // current_distances[idx_longer + 1] = sz_min_of_three(cost_deletion, cost_insertion, cost_substitution);
- // }
- //
- // Given the complexity of handling the data-dependency between consecutive insertion cost computations
- // within a Levenshtein matrix, the simplest design would be to vectorize every kind of cost computation
- // separately.
- // 1. Compute substitution costs for up to 64 characters at once, upcasting from 8-bit integers to 32.
- // 2. Compute the pairwise minimum with deletion costs.
- // 3. Inclusive prefix minimum computation to combine with insertion costs.
- // Proceeding with substitutions:
- for (sz_size_t idx_longer = 0; idx_longer < longer_length; idx_longer += 64) {
- sz_size_t register_length = sz_min_of_two(longer_length - idx_longer, 64);
- __mmask64 mask = _sz_u64_mask_until(register_length);
- longer_vec.zmm = _mm512_maskz_loadu_epi8(mask, longer + idx_longer);
-
- // Blend the `row_(first|second|third|fourth)_subs_vec` into `current_vec`, picking the right source
- // for every character in `longer_vec`. Before that, we need to permute the substitution vectors.
- // Only the bottom 6 bits of a byte are used in VPERMB, so we don't even need to mask.
- shuffled_first_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_first_subs_vec.zmm);
- shuffled_second_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_second_subs_vec.zmm);
- shuffled_third_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_third_subs_vec.zmm);
- shuffled_fourth_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_fourth_subs_vec.zmm);
-
- // To blend we can invoke three `_mm512_cmplt_epu8_mask`, but we can also achieve the same using
- // the AND logical operation, checking the top two bits of every byte.
- // Continuing this thought, we can use the VPTESTMB instruction to output the mask after the AND.
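// For reference, the masked blend below is just a vectorized way of indexing one 256-byte row,
// with the top two bits of every character selecting one of the four 64-byte quarters
// (an illustrative scalar sketch, not part of the library):
//
//     sz_u8_t c = ((sz_u8_t const *)longer)[idx_longer + i];
//     int is_third_or_fourth = (c & 0x80) != 0;  // quarters 2 or 3
//     int is_second_or_fourth = (c & 0x40) != 0; // quarters 1 or 3
//     sz_error_cost_t cost = row_subs[(is_third_or_fourth ? 128 : 0) + (is_second_or_fourth ? 64 : 0) + (c & 0x3F)];
//
// ...which is equivalent to plain `row_subs[c]`, but maps directly onto the two bit-test masks
// and the two nested blends that follow.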
- __mmask64 is_third_or_fourth = _mm512_mask_test_epi8_mask(mask, longer_vec.zmm, is_third_or_fourth_vec.zmm); - __mmask64 is_second_or_fourth = - _mm512_mask_test_epi8_mask(mask, longer_vec.zmm, is_second_or_fourth_vec.zmm); - lookup_substitution_vec.zmm = _mm512_mask_blend_epi8( - is_third_or_fourth, - // Choose between the first and the second. - _mm512_mask_blend_epi8(is_second_or_fourth, shuffled_first_subs_vec.zmm, shuffled_second_subs_vec.zmm), - // Choose between the third and the fourth. - _mm512_mask_blend_epi8(is_second_or_fourth, shuffled_third_subs_vec.zmm, shuffled_fourth_subs_vec.zmm)); - - // First, sign-extend lower and upper 16 bytes to 16-bit integers. - __m512i current_0_31_vec = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(lookup_substitution_vec.zmm, 0)); - __m512i current_32_63_vec = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(lookup_substitution_vec.zmm, 1)); - - // Now extend those 16-bit integers to 32-bit. - // This isn't free, same as the subsequent store, so we only want to do that for the populated lanes. - // To minimize the number of loads and stores, we can combine our substitution costs with the previous - // distances, containing the deletion costs. - { - cost_substitution_vec.zmm = _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer); - cost_substitution_vec.zmm = _mm512_add_epi32( - cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_0_31_vec, 0))); - cost_deletion_vec.zmm = _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer); - cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm); - current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm); - - // Inclusive prefix minimum computation to combine with insertion costs. - // Simply disabling this operation results in 5x performance improvement, meaning - // that this operation is responsible for 80% of the total runtime. - // for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) { - // current_distances[idx_longer + 1] = - // sz_max_of_two(current_distances[idx_longer] + gap, current_distances[idx_longer + 1]); - // } - // - // To perform the same operation in vectorized form, we need to perform a tree-like reduction, - // that will involve multiple steps. It's quite expensive and should be first tested in the - // "experimental" section. - // - // Another approach might be loop unrolling: - // current_vec.i32s[0] = last_in_row = sz_i32_max_of_two(current_vec.i32s[0], last_in_row + gap); - // current_vec.i32s[1] = last_in_row = sz_i32_max_of_two(current_vec.i32s[1], last_in_row + gap); - // current_vec.i32s[2] = last_in_row = sz_i32_max_of_two(current_vec.i32s[2], last_in_row + gap); - // ... yet this approach is also quite expensive. - for (int i = 0; i != 16; ++i) - current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap); - _mm512_mask_storeu_epi32(current_distances + idx_longer + 1, (__mmask16)mask, current_vec.zmm); - } - - // Export the values from 16 to 31. 
- if (register_length > 16) {
- mask = _kshiftri_mask64(mask, 16);
- cost_substitution_vec.zmm =
- _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 16);
- cost_substitution_vec.zmm = _mm512_add_epi32(
- cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_0_31_vec, 1)));
- cost_deletion_vec.zmm =
- _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 16);
- cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm);
- current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm);
-
- // Aggregate running insertion costs within the register.
- for (int i = 0; i != 16; ++i)
- current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
- _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 16, (__mmask16)mask, current_vec.zmm);
- }
-
- // Export the values from 32 to 47.
- if (register_length > 32) {
- mask = _kshiftri_mask64(mask, 16);
- cost_substitution_vec.zmm =
- _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 32);
- cost_substitution_vec.zmm = _mm512_add_epi32(
- cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_32_63_vec, 0)));
- cost_deletion_vec.zmm =
- _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 32);
- cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm);
- current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm);
-
- // Aggregate running insertion costs within the register.
- for (int i = 0; i != 16; ++i)
- current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
- _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 32, (__mmask16)mask, current_vec.zmm);
- }
-
- // Export the values from 48 to 63.
- if (register_length > 48) {
- mask = _kshiftri_mask64(mask, 16);
- cost_substitution_vec.zmm =
- _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 48);
- cost_substitution_vec.zmm = _mm512_add_epi32(
- cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_32_63_vec, 1)));
- cost_deletion_vec.zmm =
- _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 48);
- cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm);
- current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm);
-
- // Aggregate running insertion costs within the register.
- for (int i = 0; i != 16; ++i)
- current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
- _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 48, (__mmask16)mask, current_vec.zmm);
- }
- }
-
- // Swap previous_distances and current_distances pointers
- sz_pointer_swap((void **)&previous_distances, (void **)&current_distances);
- }
-
- // Cache scalar before `free` call.
- sz_ssize_t result = previous_distances[longer_length]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -SZ_INTERNAL sz_ssize_t sz_alignment_score_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { - - if (sz_max_of_two(shorter_length, longer_length) < (256ull * 256ull * 256ull)) - return _sz_alignment_score_wagner_fisher_upto17m_avx512(shorter, shorter_length, longer, longer_length, subs, - gap, alloc); - else - return sz_alignment_score_serial(shorter, shorter_length, longer, longer_length, subs, gap, alloc); -} - -enum sz_encoding_t { - sz_encoding_unknown_k = 0, - sz_encoding_ascii_k = 1, - sz_encoding_utf8_k = 2, - sz_encoding_utf16_k = 3, - sz_encoding_utf32_k = 4, - sz_jwt_k, - sz_base64_k, - // Low priority encodings: - sz_encoding_utf8bom_k = 5, - sz_encoding_utf16le_k = 6, - sz_encoding_utf16be_k = 7, - sz_encoding_utf32le_k = 8, - sz_encoding_utf32be_k = 9, -}; - -// Character Set Detection is one of the most commonly performed operations in data processing with -// [Chardet](https://github.com/chardet/chardet), [Charset Normalizer](https://github.com/jawah/charset_normalizer), -// [cChardet](https://github.com/PyYoshi/cChardet) being the most commonly used options in the Python ecosystem. -// All of them are notoriously slow. -// -// Moreover, as of October 2024, UTF-8 is the dominant character encoding on the web, used by 98.4% of websites. -// Other have minimal usage, according to [W3Techs](https://w3techs.com/technologies/overview/character_encoding): -// - ISO-8859-1: 1.2% -// - Windows-1252: 0.3% -// - Windows-1251: 0.2% -// - EUC-JP: 0.1% -// - Shift JIS: 0.1% -// - EUC-KR: 0.1% -// - GB2312: 0.1% -// - Windows-1250: 0.1% -// Within programming language implementations and database management systems, 16-bit and 32-bit fixed-width encodings -// are also very popular and we need a way to efficienly differentiate between the most common UTF flavors, ASCII, and -// the rest. -// -// One good solution is the [simdutf](https://github.com/simdutf/simdutf) library, but it depends on the C++ runtime -// and focuses more on incremental validation & transcoding, rather than detection. -// -// So we need a very fast and efficient way of determining -SZ_PUBLIC sz_bool_t sz_detect_encoding(sz_cptr_t text, sz_size_t length) { - // https://github.com/simdutf/simdutf/blob/master/src/icelake/icelake_utf8_validation.inl.cpp - // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_from_utf8.inl.cpp#L81 - // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L661 - // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L788 - - // We can implement this operation simpler & differently, assuming most of the time continuous chunks of memory - // have identical encoding. With Russian and many European languages, we generally deal with 2-byte codepoints - // with occasional 1-byte punctuation marks. In the case of Chinese, Japanese, and Korean, we deal with 3-byte - // codepoints. In the case of emojis, we deal with 4-byte codepoints. - // We can also use the idea, that misaligned reads are quite cheap on modern CPUs. 
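// A minimal serial sketch of such a detector could start with BOMs and an ASCII scan before
// attempting UTF-8 validation (purely illustrative; the eventual SIMD design may look different):
//
//     if (length >= 3 && sz_equal(text, "\xEF\xBB\xBF", 3)) return /* UTF-8 with BOM */;
//     if (length >= 4 && sz_equal(text, "\xFF\xFE\x00\x00", 4)) return /* UTF-32 LE */;
//     if (length >= 2 && sz_equal(text, "\xFF\xFE", 2)) return /* UTF-16 LE */;
//     if (length >= 2 && sz_equal(text, "\xFE\xFF", 2)) return /* UTF-16 BE */;
//     if (sz_isascii(text, length)) return /* ASCII */;
//     // ...otherwise validate UTF-8, falling back to fixed-width guesses.
//
// Note that the UTF-32 LE check must precede the UTF-16 LE one, since their BOMs share a prefix.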
- int can_be_ascii = 1, can_be_utf8 = 1, can_be_utf16 = 1, can_be_utf32 = 1; - sz_unused(can_be_ascii + can_be_utf8 + can_be_utf16 + can_be_utf32); - sz_unused(text && length); - return sz_false_k; -} - -#pragma clang attribute pop -#pragma GCC pop_options -#endif - -#pragma endregion - -/* @brief Implementation of the string search algorithms using the Arm NEON instruction set, available on 64-bit - * Arm processors. Implements: {substring search, character search, character set search} x {forward, reverse}. - */ -#pragma region ARM NEON - -#if SZ_USE_ARM_NEON -#pragma GCC push_options -#pragma GCC target("arch=armv8.2-a+simd") -#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) - -/** - * @brief Helper structure to simplify work with 64-bit words. - */ -typedef union sz_u128_vec_t { - uint8x16_t u8x16; - uint16x8_t u16x8; - uint32x4_t u32x4; - uint64x2_t u64x2; - sz_u64_t u64s[2]; - sz_u32_t u32s[4]; - sz_u16_t u16s[8]; - sz_u8_t u8s[16]; -} sz_u128_vec_t; - -SZ_INTERNAL sz_u64_t _sz_vreinterpretq_u8_u4(uint8x16_t vec) { - // Use `vshrn` to produce a bitmask, similar to `movemask` in SSE. - // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon - return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(vec), 4)), 0) & 0x8888888888888888ull; -} - -SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: - //! https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations - return sz_order_serial(a, a_length, b, b_length); -} - -SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_u128_vec_t a_vec, b_vec; - for (; length >= 16; a += 16, b += 16, length -= 16) { - a_vec.u8x16 = vld1q_u8((sz_u8_t const *)a); - b_vec.u8x16 = vld1q_u8((sz_u8_t const *)b); - uint8x16_t cmp = vceqq_u8(a_vec.u8x16, b_vec.u8x16); - if (vminvq_u8(cmp) != 255) { return sz_false_k; } // Check if all bytes match - } - - // Handle remaining bytes - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; -} - -SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) { - uint64x2_t sum_vec = vdupq_n_u64(0); - - // Process 16 bytes (128 bits) at a time - for (; length >= 16; text += 16, length -= 16) { - uint8x16_t vec = vld1q_u8((sz_u8_t const *)text); // Load 16 bytes - uint16x8_t pairwise_sum1 = vpaddlq_u8(vec); // Pairwise add lower and upper 8 bits - uint32x4_t pairwise_sum2 = vpaddlq_u16(pairwise_sum1); // Pairwise add 16-bit results - uint64x2_t pairwise_sum3 = vpaddlq_u32(pairwise_sum2); // Pairwise add 32-bit results - sum_vec = vaddq_u64(sum_vec, pairwise_sum3); // Accumulate the sum - } - - // Final reduction of `sum_vec` to a single scalar - sz_u64_t sum = vgetq_lane_u64(sum_vec, 0) + vgetq_lane_u64(sum_vec, 1); - if (length) sum += sz_checksum_serial(text, length); - return sum; -} - -SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // In most cases the `source` and the `target` are not aligned, but we should - // at least make sure that writes don't touch many cache lines. - // NEON has an instruction to load and write 64 bytes at once. - // - // sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. 
- // sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - // for (; head_length; target += 1, source += 1, head_length -= 1) *target = *source; - // length -= head_length; - // for (; length >= 64; target += 64, source += 64, length -= 64) - // vst4q_u8((sz_u8_t *)target, vld1q_u8_x4((sz_u8_t const *)source)); - // for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = *source; - // - // Sadly, those instructions end up being 20% slower than the code processing 16 bytes at a time: - for (; length >= 16; target += 16, source += 16, length -= 16) - vst1q_u8((sz_u8_t *)target, vld1q_u8((sz_u8_t const *)source)); - if (length) sz_copy_serial(target, source, length); -} - -SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // When moving small buffers, using a small buffer on stack as a temporary storage is faster. - - if (target < source || target >= source + length) { - // Non-overlapping, proceed forward - sz_copy_neon(target, source, length); - } - else { - // Overlapping, proceed backward - target += length; - source += length; - - sz_u128_vec_t src_vec; - while (length >= 16) { - target -= 16, source -= 16, length -= 16; - src_vec.u8x16 = vld1q_u8((sz_u8_t const *)source); - vst1q_u8((sz_u8_t *)target, src_vec.u8x16); - } - while (length) { - target -= 1, source -= 1, length -= 1; - *target = *source; - } - } -} - -SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - uint8x16_t fill_vec = vdupq_n_u8(value); // Broadcast the value across the register - - while (length >= 16) { - vst1q_u8((sz_u8_t *)target, fill_vec); - target += 16; - length -= 16; - } - - // Handle remaining bytes - if (length) sz_fill_serial(target, length, value); -} - -SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } - - sz_size_t head_length = (16 - ((sz_size_t)target % 16)) % 16; // 15 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 16; // 15 or less. - - // We need to pull the lookup table into 16x NEON registers. We have a total of 32 such registers. - // According to the Neoverse V2 manual, the 4-table lookup has a latency of 6 cycles, and 4x throughput. - uint8x16x4_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec; - lut_0_to_63_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 0)); - lut_64_to_127_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 64)); - lut_128_to_191_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 128)); - lut_192_to_255_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 192)); - - sz_u128_vec_t source_vec; - // If the top bit is set in each word of `source_vec`, than we use `lookup_128_to_191_vec` or - // `lookup_192_to_255_vec`. If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`. 
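// In scalar form, the 4-table lookup below boils down to the following
// (an illustrative sketch, not part of the library):
//
//     sz_u8_t c = ((sz_u8_t const *)source)[i];
//     sz_u8_t from_0_to_63    = (c ^ 0x00) < 64 ? lut[0   + (c ^ 0x00)] : 0;
//     sz_u8_t from_64_to_127  = (c ^ 0x40) < 64 ? lut[64  + (c ^ 0x40)] : 0;
//     sz_u8_t from_128_to_191 = (c ^ 0x80) < 64 ? lut[128 + (c ^ 0x80)] : 0;
//     sz_u8_t from_192_to_255 = (c ^ 0xc0) < 64 ? lut[192 + (c ^ 0xc0)] : 0;
//     target[i] = from_0_to_63 | from_64_to_127 | from_128_to_191 | from_192_to_255;
//
// Exactly one of the four terms is non-zero, because each XOR maps only "its own" 64-byte range
// into valid [0; 64) indices, and `vqtbl4q_u8` yields zeros for all out-of-range indices.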
- sz_u128_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec; - sz_u128_vec_t blended_0_to_255_vec; - - // Process the head with serial code - for (; head_length; target += 1, source += 1, head_length -= 1) *target = lut[*(sz_u8_t const *)source]; - - // Table lookups on Arm are much simpler to use than on x86, as we can use the `vqtbl4q_u8` instruction - // to perform a 4-table lookup in a single instruction. The XORs are used to adjust the lookup position - // within each 64-byte range of the table. - // Details on the 4-table lookup: https://lemire.me/blog/2019/07/23/arbitrary-byte-to-byte-maps-using-arm-neon/ - length -= head_length; - length -= tail_length; - for (; length >= 16; source += 16, target += 16, length -= 16) { - source_vec.u8x16 = vld1q_u8((sz_u8_t const *)source); - lookup_0_to_63_vec.u8x16 = vqtbl4q_u8(lut_0_to_63_vec, source_vec.u8x16); - lookup_64_to_127_vec.u8x16 = vqtbl4q_u8(lut_64_to_127_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x40))); - lookup_128_to_191_vec.u8x16 = vqtbl4q_u8(lut_128_to_191_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x80))); - lookup_192_to_255_vec.u8x16 = vqtbl4q_u8(lut_192_to_255_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0xc0))); - blended_0_to_255_vec.u8x16 = vorrq_u8(vorrq_u8(lookup_0_to_63_vec.u8x16, lookup_64_to_127_vec.u8x16), - vorrq_u8(lookup_128_to_191_vec.u8x16, lookup_192_to_255_vec.u8x16)); - vst1q_u8((sz_u8_t *)target, blended_0_to_255_vec.u8x16); - } - - // Process the tail with serial code - for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = lut[*(sz_u8_t const *)source]; -} - -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - sz_u64_t matches; - sz_u128_vec_t h_vec, n_vec, matches_vec; - n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n); - - while (h_length >= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h); - matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16); - // In Arm NEON we don't have a `movemask` to combine it with `ctz` and get the offset of the match. - // But assuming the `vmaxvq` is cheap, we can use it to find the first match, by blending (bitwise selecting) - // the vector with a relative offsets array. - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - - h += 16, h_length -= 16; - } - - return sz_find_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - sz_u64_t matches; - sz_u128_vec_t h_vec, n_vec, matches_vec; - n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n); - - while (h_length >= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h + h_length - 16); - matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; - h_length -= 16; - } - - return sz_rfind_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_u64_t _sz_find_charset_neon_register(sz_u128_vec_t h_vec, uint8x16_t set_top_vec_u8x16, - uint8x16_t set_bottom_vec_u8x16) { - - // Once we've read the characters in the haystack, we want to - // compare them against our bitset. The serial version of that code - // would look like: `(set_->_u8s[c >> 3] & (1u << (c & 7u))) != 0`. 
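// Spelled out per byte, the NEON code below computes exactly that predicate, split across the
// two 16-byte halves of the bitset (an illustrative scalar sketch, not part of the library):
//
//     sz_u8_t c = h_vec.u8s[i];
//     sz_u8_t byte_index = c >> 3;                   // which of the 32 bitset bytes to fetch
//     sz_u8_t byte_mask = (sz_u8_t)(1u << (c & 7u)); // which bit within that byte
//     sz_u8_t top = byte_index < 16 ? set_u8s[byte_index] : 0;     // covered by `set_top_vec_u8x16`
//     sz_u8_t bottom = byte_index >= 16 ? set_u8s[byte_index] : 0; // covered by `set_bottom_vec_u8x16`
//     sz_u8_t is_match = ((top | bottom) & byte_mask) ? 0xFF : 0x00; // broadcast like `vtstq_u8`
//
// ...where `set_u8s` stands for the original `sz_charset_t::_u8s` array the two halves were loaded from.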
- uint8x16_t byte_index_vec = vshrq_n_u8(h_vec.u8x16, 3);
- uint8x16_t byte_mask_vec = vshlq_u8(vdupq_n_u8(1), vreinterpretq_s8_u8(vandq_u8(h_vec.u8x16, vdupq_n_u8(7))));
- uint8x16_t matches_top_vec = vqtbl1q_u8(set_top_vec_u8x16, byte_index_vec);
- // The table lookup instruction in NEON replies to out-of-bound requests with zeros.
- // The values in `byte_index_vec` all fall in [0; 32). So for values under 16, subtracting 16 will underflow
- // and map into interval [240, 256). Meaning that those will be populated with zeros and we can safely
- // merge `matches_top_vec` and `matches_bottom_vec` with a bitwise OR.
- uint8x16_t matches_bottom_vec = vqtbl1q_u8(set_bottom_vec_u8x16, vsubq_u8(byte_index_vec, vdupq_n_u8(16)));
- uint8x16_t matches_vec = vorrq_u8(matches_top_vec, matches_bottom_vec);
- // Instead of pure `vandq_u8`, we can immediately broadcast a match presence across each 8-bit word.
- matches_vec = vtstq_u8(matches_vec, byte_mask_vec);
- return _sz_vreinterpretq_u8_u4(matches_vec);
-}
-
-SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
-
- // This almost never fires, but it's better to be safe than sorry.
- if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
- if (n_length == 1) return sz_find_byte_neon(h, h_length, n);
-
- // Scan through the string.
- // Given how tiny the Arm NEON registers are, we should avoid internal branches at all costs.
- // That's why, for smaller needles, we use different loops.
- if (n_length == 2) {
- // Broadcast needle characters into SIMD registers.
- sz_u64_t matches;
- sz_u128_vec_t h_first_vec, h_last_vec, n_first_vec, n_last_vec, matches_vec;
- // Dealing with 16-bit values, we can load 2 registers at a time and compare 31 possible offsets
- // in a single loop iteration.
- n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]);
- n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]);
- for (; h_length >= 17; h += 16, h_length -= 16) {
- h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0));
- h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1));
- matches_vec.u8x16 =
- vandq_u8(vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
- matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16);
- if (matches) return h + sz_u64_ctz(matches) / 4;
- }
- }
- else if (n_length == 3) {
- // Broadcast needle characters into SIMD registers.
- sz_u64_t matches;
- sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec;
- // Comparing 24-bit values is a bummer. Being lazy, I went with the same approach
- // as when searching for strings over 4 characters long. I only avoid the last comparison.
- n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]);
- n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]);
- n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[2]);
- for (; h_length >= 18; h += 16, h_length -= 16) {
- h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0));
- h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1));
- h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 2));
- matches_vec.u8x16 = vandq_u8( //
- vandq_u8( //
- vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), //
- vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)),
- vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
- matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16);
- if (matches) return h + sz_u64_ctz(matches) / 4;
- }
- }
- else {
- // Pick the parts of the needle that are worth comparing.
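// `_sz_locate_needle_anomalies` is defined elsewhere in this header; conceptually it just picks
// three probe offsets within the needle. A trivial stand-in with the same contract would be
// (a hypothetical sketch, not the actual heuristic):
//
//     *offset_first = 0, *offset_mid = n_length / 2, *offset_last = n_length - 1;
//
// The real routine presumably favors more distinctive bytes at those positions, so that the cheap
// three-byte filter below rejects most candidate positions before the full `sz_equal` confirmation.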
- sz_size_t offset_first, offset_mid, offset_last;
- _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last);
- // Broadcast those characters into SIMD registers.
- sz_u64_t matches;
- sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec;
- n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]);
- n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]);
- n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]);
- // Walk through the string.
- for (; h_length >= n_length + 16; h += 16, h_length -= 16) {
- h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_first));
- h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_mid));
- h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_last));
- matches_vec.u8x16 = vandq_u8( //
- vandq_u8( //
- vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), //
- vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)),
- vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
- matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16);
- while (matches) {
- int potential_offset = sz_u64_ctz(matches) / 4;
- if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset;
- matches &= matches - 1;
+ // It's possible that the sequence is already partitioned.
+ if (split != 0 && split != sequence->count) {
+ // Use two pointers to efficiently reposition elements.
+ // One pointer walks left-to-right from the start, and the other walks right-to-left from the end.
+ sz_size_t left = 0;
+ sz_size_t right = sequence->count - 1;
+ while (1) {
+ // Find the next element with the bit set on the left side.
+ while (left < split && !(sequence->order[left] & mask)) ++left;
+ // Find the next element without the bit set on the right side.
+ while (right >= split && (sequence->order[right] & mask)) --right;
+ // Swap the mispositioned elements.
+ if (left < split && right >= split) {
+ sz_u64_swap(sequence->order + left, sequence->order + right);
+ ++left;
+ --right;
 }
+ else { break; }
 }
 }
- return sz_find_serial(h, h_length, n, n_length);
-}
-
-SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
-
- // This almost never fires, but it's better to be safe than sorry.
- if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
- if (n_length == 1) return sz_rfind_byte_neon(h, h_length, n);
-
- // Pick the parts of the needle that are worth comparing.
- sz_size_t offset_first, offset_mid, offset_last;
- _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last);
-
- // Will contain 4 bits per character.
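// That is the `vshrn`-based emulation of `movemask`: every one of the 16 comparison bytes
// contributes one nibble, of which only the top bit survives the 0x8888... mask. For example
// (illustrative only), if byte 2 is the sole match, `matches == 0x0000000000000800`, so
// `sz_u64_ctz(matches) / 4 == 2` recovers the offset from the front, and `sz_u64_clz(matches) / 4`
// counts matching positions from the back, as needed for this reverse-order search.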
- sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]); - - sz_cptr_t h_reversed; - for (; h_length >= n_length + 16; h_length -= 16) { - h_reversed = h + h_length - n_length - 16 + 1; - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_first)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_mid)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_last)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - while (matches) { - int potential_offset = sz_u64_clz(matches) / 4; - if (sz_equal(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & (1ull << (63 - potential_offset * 4))) != 0 && - "The bit must be set before we squash it"); - matches &= ~(1ull << (63 - potential_offset * 4)); - } - } - - return sz_rfind_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { - sz_u64_t matches; - sz_u128_vec_t h_vec; - uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); - uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); - - for (; h_length >= 16; h += 16, h_length -= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h)); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - - return sz_find_charset_serial(h, h_length, set); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { - sz_u64_t matches; - sz_u128_vec_t h_vec; - uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); - uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); - - // Check `sz_find_charset_neon` for explanations. - for (; h_length >= 16; h_length -= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h) + h_length - 16); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); - if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; - } - - return sz_rfind_charset_serial(h, h_length, set); -} - -#pragma clang attribute pop -#pragma GCC pop_options -#endif // Arm Neon - -#pragma endregion - -/* @brief Implementation of the string search algorithms using the Arm SVE variable-length registers, available - * in Arm v9 processors. - * - * Implements: - * - memory: {copy, move, fill} - * - comparisons: {equal, order} - * - search: {substring, character, character set} x {forward, reverse}. 
- */ -#pragma region ARM SVE - -#if SZ_USE_ARM_SVE -#pragma GCC push_options -#pragma GCC target("arch=armv8.2-a+sve") -#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) - -SZ_PUBLIC void sz_fill_sve(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - svuint8_t value_vec = svdup_u8(value); - sz_size_t vec_len = svcntb(); // Vector length in bytes (scalable) - - if (length <= vec_len) { - // Small buffer case: use mask to handle small writes - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)length); - svst1_u8(mask, (unsigned char *)target, value_vec); - } - else { - // Calculate head, body, and tail sizes - sz_size_t head_length = vec_len - ((sz_size_t)target % vec_len); - sz_size_t tail_length = (sz_size_t)(target + length) % vec_len; - sz_size_t body_length = length - head_length - tail_length; - - // Handle unaligned head - svbool_t head_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)head_length); - svst1_u8(head_mask, (unsigned char *)target, value_vec); - target += head_length; - - // Aligned body loop - for (; body_length >= vec_len; target += vec_len, body_length -= vec_len) { - svst1_u8(svptrue_b8(), (unsigned char *)target, value_vec); - } - - // Handle unaligned tail - svbool_t tail_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)tail_length); - svst1_u8(tail_mask, (unsigned char *)target, value_vec); - } -} - -SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - sz_size_t vec_len = svcntb(); // Vector length in bytes + // Go down recursively. + if (bit_idx < bit_max) { + sz_sequence_t a = *sequence; + a.count = split; + sz_sort_recursion(&a, bit_idx + 1, bit_max, comparator, partial_order_length); - // Arm Neoverse V2 cores in Graviton 4, for example, come with 256 KB of L1 data cache per core, - // and 8 MB of L2 cache per core. Moreover, the L1 cache is fully associative. - // With two strings, we may consider the overal workload huge, if each exceeds 1 MB in length. - // - // int is_huge = length >= 4ull * 1024ull * 1024ull; - // - // When the buffer is small, there isn't much to innovate. - if (length <= vec_len) { - // Small buffer case: use mask to handle small writes - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); + sz_sequence_t b = *sequence; + b.order += split; + b.count -= split; + sz_sort_recursion(&b, bit_idx + 1, bit_max, comparator, partial_order_length); } - // When dealing with larger buffers, similar to AVX-512, we want minimize unaligned operations - // and handle the head, body, and tail separately. We can also traverse the buffer in both directions - // as Arm generally supports more simultaneous stores than x86 CPUs. - // - // For gigantic datasets, similar to AVX-512, non-temporal "loads" and "stores" can be used. - // Sadly, if the register size (16 byte or larger) is smaller than a cache-line (64 bytes) - // we will pay a huge penalty on loads, fetching the same content many times. - // It may be better to allow caching (and subsequent eviction), in favor of using four-element - // tuples, wich will be guaranteed to be a multiple of a cache line. - // - // Another approach is to use the `LD4B` instructions, which will populate four registers at once. - // This however, further decreases the performance from LibC-like 29 GB/s to 20 GB/s. + // Reached the end of recursion. 
else { - // Calculating head, body, and tail sizes depends on the `vec_len`, - // but it's runtime constant, and the modulo operation is expensive! - // Instead we use the fact, that it's always a multiple of 128 bits or 16 bytes. - sz_size_t head_length = 16 - ((sz_size_t)target % 16); - sz_size_t tail_length = (sz_size_t)(target + length) % 16; - sz_size_t body_length = length - head_length - tail_length; + // Discard the prefixes. + sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; + for (sz_size_t i = 0; i != sequence->count; ++i) { order_half_words[i * 2 + 1] = 0; } - // Handle unaligned parts - svbool_t head_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)head_length); - svuint8_t head_data = svld1_u8(head_mask, (unsigned char *)source); - svst1_u8(head_mask, (unsigned char *)target, head_data); - svbool_t tail_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)tail_length); - svuint8_t tail_data = svld1_u8(tail_mask, (unsigned char *)source + head_length + body_length); - svst1_u8(tail_mask, (unsigned char *)target + head_length + body_length, tail_data); - target += head_length; - source += head_length; + sz_sequence_t a = *sequence; + a.count = split; + sz_sort_introsort(&a, comparator); - // Aligned body loop, walking in two directions - for (; body_length >= vec_len * 2; target += vec_len, source += vec_len, body_length -= vec_len * 2) { - svuint8_t forward_data = svld1_u8(svptrue_b8(), (unsigned char *)source); - svuint8_t backward_data = svld1_u8(svptrue_b8(), (unsigned char *)source + body_length - vec_len); - svst1_u8(svptrue_b8(), (unsigned char *)target, forward_data); - svst1_u8(svptrue_b8(), (unsigned char *)target + body_length - vec_len, backward_data); - } - // Up to (vec_len * 2 - 1) bytes of data may be left in the body, - // so we can unroll the last two optional loop iterations. - if (body_length > vec_len) { - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)body_length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - body_length -= vec_len; - source += body_length; - target += body_length; - } - if (body_length) { - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)body_length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - } + sz_sequence_t b = *sequence; + b.order += split; + b.count -= split; + sz_sort_introsort(&b, comparator); } } -#pragma clang attribute pop -#pragma GCC pop_options -#endif // Arm SVE - -#pragma endregion - -/* - * @brief Pick the right implementation for the string search algorithms. 
- */ -#pragma region Compile Time Dispatching - -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t ins, sz_size_t length) { return sz_hash_serial(ins, length); } -SZ_PUBLIC void sz_tolower(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_tolower_serial(ins, length, outs); } -SZ_PUBLIC void sz_toupper(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toupper_serial(ins, length, outs); } -SZ_PUBLIC void sz_toascii(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toascii_serial(ins, length, outs); } -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t ins, sz_size_t length) { return sz_isascii_serial(ins, length); } - -SZ_PUBLIC void sz_hashes_fingerprint(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_ptr_t fingerprint, - sz_size_t fingerprint_bytes) { - - sz_bool_t fingerprint_length_is_power_of_two = (sz_bool_t)((fingerprint_bytes & (fingerprint_bytes - 1)) == 0); - sz_string_view_t fingerprint_buffer = {fingerprint, fingerprint_bytes}; - - // There are several issues related to the fingerprinting algorithm. - // First, the memory traversal order is important. - // https://blog.stuffedcow.net/2015/08/pagewalk-coherence/ - - // In most cases the fingerprint length will be a power of two. - if (fingerprint_length_is_power_of_two == sz_false_k) - sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_non_pow2_callback, &fingerprint_buffer); - else - sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_pow2_callback, &fingerprint_buffer); -} - -#if !SZ_DYNAMIC_DISPATCH - -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { -#if SZ_USE_X86_AVX512 - return sz_checksum_avx512(text, length); -#elif SZ_USE_X86_AVX2 - return sz_checksum_avx2(text, length); -#elif SZ_USE_ARM_NEON - return sz_checksum_neon(text, length); -#else - return sz_checksum_serial(text, length); -#endif -} - -SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { -#if SZ_USE_X86_AVX512 - return sz_equal_avx512(a, b, length); -#elif SZ_USE_X86_AVX2 - return sz_equal_avx2(a, b, length); -#elif SZ_USE_ARM_NEON - return sz_equal_neon(a, b, length); -#else - return sz_equal_serial(a, b, length); -#endif -} - -SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { -#if SZ_USE_X86_AVX512 - return sz_order_avx512(a, a_length, b, b_length); -#elif SZ_USE_X86_AVX2 - return sz_order_avx2(a, a_length, b, b_length); -#elif SZ_USE_ARM_NEON - return sz_order_neon(a, a_length, b, b_length); -#else - return sz_order_serial(a, a_length, b, b_length); -#endif -} - -SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_X86_AVX512 - sz_copy_avx512(target, source, length); -#elif SZ_USE_X86_AVX2 - sz_copy_avx2(target, source, length); -#elif SZ_USE_ARM_NEON - sz_copy_neon(target, source, length); -#else - sz_copy_serial(target, source, length); -#endif -} - -SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_X86_AVX512 - sz_move_avx512(target, source, length); -#elif SZ_USE_X86_AVX2 - sz_move_avx2(target, source, length); -#elif SZ_USE_ARM_NEON - sz_move_neon(target, source, length); -#else - sz_move_serial(target, source, length); -#endif -} - -SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) { -#if SZ_USE_X86_AVX512 - sz_fill_avx512(target, length, value); -#elif SZ_USE_X86_AVX2 - sz_fill_avx2(target, length, value); -#elif SZ_USE_ARM_NEON - sz_fill_neon(target, length, value); -#else - sz_fill_serial(target, length, value); -#endif -} - 
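// Note that with `SZ_DYNAMIC_DISPATCH` disabled, each of these wrappers is resolved entirely at
// compile time: a build with AVX-512 enabled (so `SZ_USE_X86_AVX512` is non-zero) forwards straight
// to the `_avx512` kernels with no function-pointer indirection. A hypothetical caller
// (illustrative only):
//
//     char buffer[4096];
//     sz_fill(buffer, sizeof(buffer), 0); // compiles down to `sz_fill_avx512` on such a build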
-SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { -#if SZ_USE_X86_AVX512 - sz_look_up_transform_avx512(source, length, lut, target); -#elif SZ_USE_X86_AVX2 - sz_look_up_transform_avx2(source, length, lut, target); -#elif SZ_USE_ARM_NEON - sz_look_up_transform_neon(source, length, lut, target); -#else - sz_look_up_transform_serial(source, length, lut, target); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_X86_AVX512 - return sz_find_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_X86_AVX2 - return sz_find_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_ARM_NEON - return sz_find_byte_neon(haystack, h_length, needle); -#else - return sz_find_byte_serial(haystack, h_length, needle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_X86_AVX512 - return sz_rfind_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_X86_AVX2 - return sz_rfind_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_ARM_NEON - return sz_rfind_byte_neon(haystack, h_length, needle); -#else - return sz_rfind_byte_serial(haystack, h_length, needle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_X86_AVX512 - return sz_find_avx512(haystack, h_length, needle, n_length); -#elif SZ_USE_X86_AVX2 - return sz_find_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_ARM_NEON - return sz_find_neon(haystack, h_length, needle, n_length); -#else - return sz_find_serial(haystack, h_length, needle, n_length); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_X86_AVX512 - return sz_rfind_avx512(haystack, h_length, needle, n_length); -#elif SZ_USE_X86_AVX2 - return sz_rfind_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_ARM_NEON - return sz_rfind_neon(haystack, h_length, needle, n_length); -#else - return sz_rfind_serial(haystack, h_length, needle, n_length); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_X86_AVX512 - return sz_find_charset_avx512(text, length, set); -#elif SZ_USE_X86_AVX2 - return sz_find_charset_avx2(text, length, set); -#elif SZ_USE_ARM_NEON - return sz_find_charset_neon(text, length, set); -#else - return sz_find_charset_serial(text, length, set); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_X86_AVX512 - return sz_rfind_charset_avx512(text, length, set); -#elif SZ_USE_X86_AVX2 - return sz_rfind_charset_avx2(text, length, set); -#elif SZ_USE_ARM_NEON - return sz_rfind_charset_neon(text, length, set); -#else - return sz_rfind_charset_serial(text, length, set); -#endif -} - -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_serial(a, a_length, b, b_length, bound); +SZ_INTERNAL sz_bool_t _sz_sort_is_less(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { + sz_cptr_t i_str = sequence->get_start(sequence, i_key); + sz_cptr_t j_str = sequence->get_start(sequence, j_key); + sz_size_t i_len = sequence->get_length(sequence, i_key); + sz_size_t j_len = sequence->get_length(sequence, j_key); + return (sz_bool_t)(sz_order_serial(i_str, i_len, 
j_str, j_len) == sz_less_k); } -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_utf8_serial(a, a_length, b, b_length, bound); -} +SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t partial_order_length) { -SZ_DYNAMIC sz_size_t sz_edit_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { -#if SZ_USE_X86_AVX512 - return sz_edit_distance_avx512(a, a_length, b, b_length, bound, alloc); +#if _SZ_IS_BIG_ENDIAN + // TODO: Implement partial sort for big-endian systems. For now this sorts the whole thing. + sz_unused(partial_order_length); + sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); #else - return sz_edit_distance_serial(a, a_length, b, b_length, bound, alloc); -#endif -} -SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - return _sz_edit_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc); -} + // Export up to 4 bytes into the `sequence` bits themselves + for (sz_size_t i = 0; i != sequence->count; ++i) { + sz_cptr_t begin = sequence->get_start(sequence, sequence->order[i]); + sz_size_t length = sequence->get_length(sequence, sequence->order[i]); + length = length > 4u ? 4u : length; + sz_ptr_t prefix = (sz_ptr_t)&sequence->order[i]; + for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; + } -SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_error_cost_t const *subs, sz_error_cost_t gap, - sz_memory_allocator_t *alloc) { -#if SZ_USE_X86_AVX512 - return sz_alignment_score_avx512(a, a_length, b, b_length, subs, gap, alloc); -#else - return sz_alignment_score_serial(a, a_length, b, b_length, subs, gap, alloc); + // Perform optionally-parallel radix sort on them + sz_sort_recursion(sequence, 0, 32, (sz_sequence_comparator_t)_sz_sort_is_less, partial_order_length); #endif } -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle) { -#if SZ_USE_X86_AVX512 - sz_hashes_avx512(text, length, window_length, window_step, callback, callback_handle); -#elif SZ_USE_X86_AVX2 - sz_hashes_avx2(text, length, window_length, window_step, callback, callback_handle); +SZ_PUBLIC void sz_sort(sz_sequence_t *sequence) { +#if _SZ_IS_BIG_ENDIAN + sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); #else - sz_hashes_serial(text, length, window_length, window_step, callback, callback_handle); + sz_sort_partial(sequence, sequence->count); #endif } -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t 
set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_rfind_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_rfind_charset(h, h_length, &set); -} - -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { - sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data); -} - -#endif -#pragma endregion +#pragma endregion // Serial Implementation #ifdef __cplusplus -#pragma GCC diagnostic pop } #endif // __cplusplus - -#endif // STRINGZILLA_H_ +#endif // STRINGZILLA_SORT_H_ From b835051c09a0ecfc420932de444f3c6839610764 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 18:50:17 +0000 Subject: [PATCH 040/751] Fix: Filter `types.h` file --- include/stringzilla/types.h | 7139 +++-------------------------------- 1 file changed, 547 insertions(+), 6592 deletions(-) diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index de7fbcac..a39620e6 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -1,31 +1,34 @@ /** - * @brief StringZilla is a collection of advanced string algorithms, designed to be used in Big Data applications. - * It is generally faster than LibC, and has a broader & cleaner interface, and targets modern x86 CPUs - * with AVX-512 and Arm NEON and older CPUs with SWAR and auto-vectorization. + * @brief Shared definitions for the StringZilla library. + * @file types.h + * @author Ash Vardanian * * Consider overriding the following macros to customize the library: * * - `SZ_DEBUG=0` - whether to enable debug assertions and logging. + * - `SZ_AVOID_LIBC=0` - whether to avoid including the standard C library headers. * - `SZ_DYNAMIC_DISPATCH=0` - whether to use runtime dispatching of the most advanced SIMD backend. * - `SZ_USE_MISALIGNED_LOADS=0` - whether to use misaligned loads on platforms that support them. * - `SZ_SWAR_THRESHOLD=24` - threshold for switching to SWAR backend over serial byte-level for-loops. - * - `SZ_USE_X86_AVX512=?` - whether to use AVX-512 instructions on x86_64. - * - `SZ_USE_X86_AVX2=?` - whether to use AVX2 instructions on x86_64. - * - `SZ_USE_ARM_NEON=?` - whether to use NEON instructions on ARM. - * - `SZ_USE_ARM_SVE=?` - whether to use SVE instructions on ARM. - * - * @see StringZilla: https://github.com/ashvardanian/StringZilla/blob/main/README.md - * @see LibC String: https://pubs.opengroup.org/onlinepubs/009695399/basedefs/string.h.html - * - * @file stringzilla.h - * @author Ash Vardanian + * - `SZ_USE_HASWELL=?` - whether to use AVX2 instructions on x86_64. + * - `SZ_USE_SKYLAKE=?` - whether to use AVX-512 instructions on x86_64. + * - `SZ_USE_ICE=?` - whether to use AVX-512 VBMI instructions on x86_64. + * - `SZ_USE_NEON=?` - whether to use NEON instructions on ARM. + * - `SZ_USE_SVE=?` - whether to use SVE and SVE2 instructions on ARM. */ -#ifndef STRINGZILLA_H_ -#define STRINGZILLA_H_ +#ifndef STRINGZILLA_TYPES_H_ +#define STRINGZILLA_TYPES_H_ -#define STRINGZILLA_VERSION_MAJOR 3 -#define STRINGZILLA_VERSION_MINOR 11 -#define STRINGZILLA_VERSION_PATCH 0 +/* + * Debugging and testing. 
+ */ +#ifndef SZ_DEBUG +#if defined(DEBUG) || defined(_DEBUG) // This means "Not using DEBUG information". +#define SZ_DEBUG (1) +#else +#define SZ_DEBUG (0) +#endif +#endif /** * @brief When set to 1, the library will include the following LibC headers: and . @@ -39,6 +42,16 @@ #define SZ_AVOID_LIBC (0) // true or false #endif +/** + * @brief Removes compile-time dispatching, and replaces it with runtime dispatching. + * So the `sz_find` function will invoke the most advanced backend supported by the CPU, + * that runs the program, rather than the most advanced backend supported by the CPU + * used to compile the library or the downstream application. + */ +#ifndef SZ_DYNAMIC_DISPATCH +#define SZ_DYNAMIC_DISPATCH (0) // true or false +#endif + /** * @brief A misaligned load can be - trying to fetch eight consecutive bytes from an address * that is not divisible by eight. On x86 enabled by default. On ARM it's not. @@ -54,27 +67,17 @@ #endif #endif -/** - * @brief Removes compile-time dispatching, and replaces it with runtime dispatching. - * So the `sz_find` function will invoke the most advanced backend supported by the CPU, - * that runs the program, rather than the most advanced backend supported by the CPU - * used to compile the library or the downstream application. - */ -#ifndef SZ_DYNAMIC_DISPATCH -#define SZ_DYNAMIC_DISPATCH (0) // true or false -#endif - /** * @brief Analogous to `size_t` and `std::size_t`, unsigned integer, identical to pointer size. * 64-bit on most platforms where pointers are 64-bit. * 32-bit on platforms where pointers are 32-bit. */ #if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64) -#define SZ_DETECT_64_BIT (1) +#define _SZ_IS_64_BIT (1) #define SZ_SIZE_MAX (0xFFFFFFFFFFFFFFFFull) // Largest unsigned integer that fits into 64 bits. #define SZ_SSIZE_MAX (0x7FFFFFFFFFFFFFFFull) // Largest signed integer that fits into 64 bits. #else -#define SZ_DETECT_64_BIT (0) +#define _SZ_IS_64_BIT (0) #define SZ_SIZE_MAX (0xFFFFFFFFu) // Largest unsigned integer that fits into 32 bits. #define SZ_SSIZE_MAX (0x7FFFFFFFu) // Largest signed integer that fits into 32 bits. #endif @@ -89,23 +92,12 @@ * In Python one can check `sys.byteorder == 'big'` in the `setup.py` script and pass the appropriate macro. * https://stackoverflow.com/a/27054190 */ -#ifndef SZ_DETECT_BIG_ENDIAN +#ifndef _SZ_IS_BIG_ENDIAN #if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \ defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || defined(__MIBSEB) || defined(__MIBSEB__) -#define SZ_DETECT_BIG_ENDIAN (1) //< It's a big-endian target architecture -#else -#define SZ_DETECT_BIG_ENDIAN (0) //< It's a little-endian target architecture -#endif -#endif - -/* - * Debugging and testing. - */ -#ifndef SZ_DEBUG -#if defined(DEBUG) || defined(_DEBUG) // This means "Not using DEBUG information". -#define SZ_DEBUG (1) +#define _SZ_IS_BIG_ENDIAN (1) //< It's a big-endian target architecture #else -#define SZ_DEBUG (0) +#define _SZ_IS_BIG_ENDIAN (0) //< It's a little-endian target architecture #endif #endif @@ -153,12 +145,93 @@ * @brief Alignment macro for 64-byte alignment. 
*/ #if defined(_MSC_VER) -#define SZ_ALIGN64 __declspec(align(64)) +#define _SZ_ALIGN64 __declspec(align(64)) #elif defined(__GNUC__) || defined(__clang__) -#define SZ_ALIGN64 __attribute__((aligned(64))) +#define _SZ_ALIGN64 __attribute__((aligned(64))) +#else +#define _SZ_ALIGN64 +#endif + +/** + * @brief Largest prime number that fits into 31 bits. + * @see https://mersenneforum.org/showthread.php?t=3471 + */ +#define SZ_U32_MAX_PRIME (2147483647u) + +/** + * @brief Largest prime number that fits into 64 bits. + * @see https://mersenneforum.org/showthread.php?t=3471 + * + * 2^64 = 18,446,744,073,709,551,616 + * this = 18,446,744,073,709,551,557 + * diff = 59 + */ +#define SZ_U64_MAX_PRIME (18446744073709551557ull) + +#if !SZ_AVOID_LIBC +#include // `size_t` +#include // `uint8_t` +#endif + +/* Compile-time hardware features detection. + * All of those can be controlled by the user. + */ +#ifndef SZ_USE_HASWELL +#ifdef __AVX2__ +#define SZ_USE_HASWELL 1 +#else +#define SZ_USE_HASWELL 0 +#endif +#endif + +#ifndef SZ_USE_SKYLAKE +#ifdef __AVX512F__ +#define SZ_USE_SKYLAKE 1 +#else +#define SZ_USE_SKYLAKE 0 +#endif +#endif + +#ifndef SZ_USE_ICE +#ifdef __AVX512BW__ +#define SZ_USE_ICE 1 +#else +#define SZ_USE_ICE 0 +#endif +#endif + +#ifndef SZ_USE_NEON +#ifdef __ARM_NEON +#define SZ_USE_NEON 1 +#else +#define SZ_USE_NEON 0 +#endif +#endif + +#ifndef SZ_USE_SVE +#ifdef __ARM_FEATURE_SVE +#define SZ_USE_SVE 1 #else -#define SZ_ALIGN64 +#define SZ_USE_SVE 0 +#endif +#endif + +/* Hardware-specific headers for different SIMD intrinsics and register wrappers. + */ +#if SZ_USE_HASWELL || SZ_USE_SKYLAKE || SZ_USE_ICE +#include +#endif // SZ_USE_X86... +#if SZ_USE_NEON +#if !defined(_MSC_VER) +#include +#endif +#include +#endif // SZ_USE_NEON +#if SZ_USE_SVE +#if !defined(_MSC_VER) +#include #endif +#endif // SZ_USE_SVE #ifdef __cplusplus extern "C" { @@ -169,8 +242,6 @@ extern "C" { * if that is allowed by the user. */ #if !SZ_AVOID_LIBC -#include // `size_t` -#include // `uint8_t` typedef int8_t sz_i8_t; // Always 8 bits typedef uint8_t sz_u8_t; // Always 8 bits typedef uint16_t sz_u16_t; // Always 16 bits @@ -210,13 +281,13 @@ typedef unsigned long long sz_u64_t; // Always 64 bits // > `long long` is also 64 bits // // Source: https://learn.microsoft.com/en-us/windows/win32/winprog64/abstract-data-models -#if SZ_DETECT_64_BIT +#if _SZ_IS_64_BIT typedef unsigned long long sz_size_t; // 64-bit. typedef long long sz_ssize_t; // 64-bit. #else typedef unsigned sz_size_t; // 32-bit. typedef unsigned sz_ssize_t; // 32-bit. -#endif // SZ_DETECT_64_BIT +#endif // _SZ_IS_64_BIT #endif // SZ_AVOID_LIBC @@ -231,8 +302,6 @@ typedef unsigned sz_ssize_t; // 32-bit. sz_static_assert(sizeof(sz_size_t) == sizeof(void *), sz_size_t_must_be_pointer_size); sz_static_assert(sizeof(sz_ssize_t) == sizeof(void *), sz_ssize_t_must_be_pointer_size); -#pragma region Public API - typedef char *sz_ptr_t; // A type alias for `char *` typedef char const *sz_cptr_t; // A type alias for `char const *` typedef sz_i8_t sz_error_cost_t; // Character mismatch cost for fuzzy matching functions @@ -242,6 +311,19 @@ typedef sz_u64_t sz_sorted_idx_t; // Index of a sorted string in a list of strin typedef enum { sz_false_k = 0, sz_true_k = 1 } sz_bool_t; // Only one relevant bit typedef enum { sz_less_k = -1, sz_equal_k = 0, sz_greater_k = 1 } sz_ordering_t; // Only three possible states: <=> +/** + * @brief Describes the length of a UTF8 @b rune / character / codepoint in bytes. 
+ */ +typedef enum { + sz_utf8_invalid_k = 0, //!< Invalid UTF8 character. + sz_utf8_rune_1byte_k = 1, //!< 1-byte UTF8 character. + sz_utf8_rune_2bytes_k = 2, //!< 2-byte UTF8 character. + sz_utf8_rune_3bytes_k = 3, //!< 3-byte UTF8 character. + sz_utf8_rune_4bytes_k = 4, //!< 4-byte UTF8 character. +} sz_rune_length_t; + +typedef sz_u32_t sz_rune_t; + /** * @brief Tiny string-view structure. It's POD type, unlike the `std::string_view`. */ @@ -250,32 +332,7 @@ typedef struct sz_string_view_t { sz_size_t length; } sz_string_view_t; -/** - * @brief Enumeration of SIMD capabilities of the target architecture. - * Used to introspect the supported functionality of the dynamic library. - */ -typedef enum sz_capability_t { - sz_cap_serial_k = 1, /// Serial (non-SIMD) capability - sz_cap_any_k = 0x7FFFFFFF, /// Mask representing any capability - - sz_cap_arm_neon_k = 1 << 10, /// ARM NEON capability - sz_cap_arm_sve_k = 1 << 11, /// ARM SVE capability TODO: Not yet supported or used - sz_cap_arm_sve2_k = 1 << 12, - sz_cap_arm_sve2p1_k = 1 << 13, - sz_cap_x86_avx2_k = 1 << 20, /// x86 AVX2 capability - sz_cap_x86_avx512f_k = 1 << 21, /// x86 AVX512 F capability - sz_cap_x86_avx512bw_k = 1 << 22, /// x86 AVX512 BW instruction capability - sz_cap_x86_avx512vl_k = 1 << 23, /// x86 AVX512 VL instruction capability - sz_cap_x86_avx512vbmi_k = 1 << 24, /// x86 AVX512 VBMI instruction capability - sz_cap_x86_gfni_k = 1 << 25, /// x86 AVX512 GFNI instruction capability - -} sz_capability_t; - -/** - * @brief Function to determine the SIMD capabilities of the current machine @b only at @b runtime. - * @return A bitmask of the SIMD capabilities represented as a `sz_capability_t` enum value. - */ -SZ_DYNAMIC sz_capability_t sz_capabilities(void); +#pragma region Character Sets /** * @brief Bit-set structure for 256 possible byte values. Useful for filtering and search. @@ -318,6 +375,10 @@ SZ_PUBLIC void sz_charset_invert(sz_charset_t *s) { s->_u64s[2] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[3] ^= 0xFFFFFFFFFFFFFFFFull; } +#pragma endregion + +#pragma region Memory Management + typedef void *(*sz_memory_allocate_t)(sz_size_t, void *); typedef void (*sz_memory_free_t)(void *, sz_size_t, void *); typedef sz_u64_t (*sz_random_generator_t)(void *); @@ -352,65 +413,9 @@ SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc); */ SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length); -/** - * @brief The number of bytes a stack-allocated string can hold, including the SZ_NULL termination character. - * ! This can't be changed from outside. Don't use the `#error` as it may already be included and set. - */ -#ifdef SZ_STRING_INTERNAL_SPACE -#undef SZ_STRING_INTERNAL_SPACE -#endif -#define SZ_STRING_INTERNAL_SPACE (sizeof(sz_size_t) * 3 - 1) // 3 pointers minus one byte for an 8-bit length - -/** - * @brief Tiny memory-owning string structure with a Small String Optimization (SSO). - * Differs in layout from Folly, Clang, GCC, and probably most other implementations. - * It's designed to avoid any branches on read-only operations, and can store up - * to 22 characters on stack on 64-bit machines, followed by the SZ_NULL-termination character. - * - * @section Changing Length - * - * One nice thing about this design, is that you can, in many cases, change the length of the string - * without any branches, invoking a `+=` or `-=` on the 64-bit `length` field. If the string is on heap, - * the solution is obvious. 
If it's on stack, inplace decrement wouldn't affect the top bytes of the string, - * only changing the last byte containing the length. - */ -typedef union sz_string_t { - -#if !SZ_DETECT_BIG_ENDIAN - - struct external { - sz_ptr_t start; - sz_size_t length; - sz_size_t space; - sz_size_t padding; - } external; - - struct internal { - sz_ptr_t start; - sz_u8_t length; - char chars[SZ_STRING_INTERNAL_SPACE]; - } internal; - -#else - - struct external { - sz_ptr_t start; - sz_size_t space; - sz_size_t padding; - sz_size_t length; - } external; - - struct internal { - sz_ptr_t start; - char chars[SZ_STRING_INTERNAL_SPACE]; - sz_u8_t length; - } internal; - -#endif - - sz_size_t words[4]; +#pragma endregion -} sz_string_t; +#pragma region API Signature Types typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t); typedef sz_u64_t (*sz_checksum_t)(sz_cptr_t, sz_size_t); @@ -418,667 +423,184 @@ typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t); typedef sz_ordering_t (*sz_order_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); typedef void (*sz_to_converter_t)(sz_cptr_t, sz_size_t, sz_ptr_t); -/** - * @brief Computes the 64-bit check-sum of bytes in a string. - * Similar to `std::ranges::accumulate`. - * - * @param text String to aggregate. - * @param length Number of bytes in the text. - * @return 64-bit unsigned value. - */ -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length); +typedef void (*sz_look_up_transform_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t); -/** @copydoc sz_checksum */ -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length); +typedef void (*sz_move_t)(sz_ptr_t, sz_cptr_t, sz_size_t); -/** - * @brief Computes the 64-bit unsigned hash of a string. Fairly fast for short strings, - * simple implementation, and supports rolling computation, reused in other APIs. - * Similar to `std::hash` in C++. - * - * @param text String to hash. - * @param length Number of bytes in the text. - * @return 64-bit hash value. - * - * @see sz_hashes, sz_hashes_fingerprint, sz_hashes_intersection - */ -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length); +typedef void (*sz_fill_t)(sz_ptr_t, sz_size_t, sz_u8_t); -/** @copydoc sz_hash */ -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length); +typedef sz_cptr_t (*sz_find_byte_t)(sz_cptr_t, sz_size_t, sz_cptr_t); +typedef sz_cptr_t (*sz_find_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); +typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_charset_t const *); -/** - * @brief Checks if two string are equal. - * Similar to `memcmp(a, b, length) == 0` in LibC and `a == b` in STL. - * - * The implementation of this function is very similar to `sz_order`, but the usage patterns are different. - * This function is more often used in parsing, while `sz_order` is often used in sorting. - * It works best on platforms with cheap - * - * @param a First string to compare. - * @param b Second string to compare. - * @param length Number of bytes in both strings. - * @return 1 if strings match, 0 otherwise. - */ -SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length); +typedef sz_size_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t); -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length); +typedef sz_size_t (*sz_edit_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_memory_allocator_t *); -/** - * @brief Estimates the relative order of two strings. 
Equivalent to `memcmp(a, b, length)` in LibC. - * Can be used on different length strings. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * @return Negative if (a < b), positive if (a > b), zero if they are equal. - */ -SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); +typedef sz_ssize_t (*sz_alignment_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *, + sz_error_cost_t, sz_memory_allocator_t *); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); +typedef void (*sz_hash_callback_t)(sz_cptr_t, sz_size_t, sz_u64_t, void *user); -/** - * @brief Look Up Table @b (LUT) transformation of a string. Equivalent to `for (char & c : text) c = lut[c]`. - * - * Can be used to implement some form of string normalization, partially masking punctuation marks, - * or converting between different character sets, like uppercase or lowercase. Surprisingly, also has - * broad implications in image processing, where image channel transformations are often done using LUTs. - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param lut Look Up Table to apply. Must be exactly @b 256 bytes long. - * @param result Output string, can point to the same address as ::text. - */ -SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); +typedef void (*sz_hashes_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_size_t, sz_hash_callback_t, void *); -typedef void (*sz_look_up_transform_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t); +typedef void (*sz_hashes_fingerprint_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_ptr_t, sz_size_t); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); +typedef sz_size_t (*sz_hashes_intersection_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_cptr_t, sz_size_t); -/** - * @brief Equivalent to `for (char & c : text) c = tolower(c)`. - * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_tolower(sz_cptr_t text, sz_size_t length, sz_ptr_t result); +#pragma endregion -/** - * @brief Equivalent to `for (char & c : text) c = toupper(c)`. - * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. 
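 *
 * A hedged illustration of the bit trick described above, not necessarily the exact kernel used here:
 *
 * @code{.c}
 *     char c = 'a';               // 0x61
 *     char u = (char)(c & ~0x20); // 0x41 == 'A': clearing the 5th bit uppercases an ASCII letter
 * @endcode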
- */ -SZ_PUBLIC void sz_toupper(sz_cptr_t text, sz_size_t length, sz_ptr_t result); +#pragma region Helper Structures /** - * @brief Equivalent to `for (char & c : text) c = toascii(c)`. - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. + * @brief Helper structure to simplify work with 16-bit words. + * @see sz_u16_load */ -SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result); +typedef union sz_u16_vec_t { + sz_u16_t u16; + sz_u8_t u8s[2]; +} sz_u16_vec_t; /** - * @brief Checks if all characters in the range are valid ASCII characters. - * - * @param text String to be analyzed. - * @param length Number of bytes in the string. - * @return Whether all characters are valid ASCII characters. + * @brief Helper structure to simplify work with 32-bit words. + * @see sz_u32_load */ -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t text, sz_size_t length); +typedef union sz_u32_vec_t { + sz_u32_t u32; + sz_u16_t u16s[2]; + sz_u8_t u8s[4]; +} sz_u32_vec_t; /** - * @brief Generates a random string for a given alphabet, avoiding integer division and modulo operations. - * Similar to `text[i] = alphabet[rand() % cardinality]`. - * - * The modulo operation is expensive, and should be avoided in performance-critical code. - * We avoid it using small lookup tables and replacing it with a multiplication and shifts, similar to `libdivide`. - * Alternative algorithms would include: - * - Montgomery form: https://en.algorithmica.org/hpc/number-theory/montgomery/ - * - Barret reduction: https://www.nayuki.io/page/barrett-reduction-algorithm - * - Lemire's trick: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ - * - * @param alphabet Set of characters to sample from. - * @param cardinality Number of characters to sample from. - * @param text Output string, can point to the same address as ::text. - * @param generate Callback producing random numbers given the generator state. - * @param generator Generator state, can be a pointer to a seed, or a pointer to a random number generator. + * @brief Helper structure to simplify work with 64-bit words. + * @see sz_u64_load */ -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); - -/** @copydoc sz_generate */ -SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); +typedef union sz_u64_vec_t { + sz_u64_t u64; + sz_u32_t u32s[2]; + sz_u16_t u16s[4]; + sz_u8_t u8s[8]; +} sz_u64_vec_t; /** - * @brief Similar to `memcpy`, copies contents of one string into another. - * The behavior is undefined if the strings overlap. - * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. + * @brief Helper structure to simplify work with @b 128-bit registers. + * It can help view the contents as 8-bit, 16-bit, 32-bit, or 64-bit integers, + * as well as 1x XMM register. 
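 *
 * A minimal sketch of the intended use, assuming an x86 target with SSE2 and an illustrative `text` pointer:
 *
 * @code{.c}
 *     sz_u128_vec_t text_vec;
 *     text_vec.xmm = _mm_loadu_si128((__m128i const *)text); // one 16-byte SIMD load
 *     sz_u8_t first_byte = text_vec.u8s[0];                  // reinterpret the same bytes scalar-wise
 * @endcode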
*/ -SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); +typedef union sz_u128_vec_t { +#if SZ_USE_HASWELL + __m128i xmm; +#endif +#if SZ_USE_NEON + uint8x16_t u8x16; + uint16x8_t u16x8; + uint32x4_t u32x4; + uint64x2_t u64x2; +#endif + sz_u64_t u64s[2]; + sz_u32_t u32s[4]; + sz_u16_t u16s[8]; + sz_u8_t u8s[16]; +} sz_u128_vec_t; /** - * @brief Similar to `memmove`, copies (moves) contents of one string into another. - * Unlike `sz_copy`, allows overlapping strings as arguments. - * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. + * @brief Helper structure to simplify work with @b 256-bit registers. + * It can help view the contents as 8-bit, 16-bit, 32-bit, or 64-bit integers, + * as well as 2x XMM registers or 1x YMM register. */ -SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -typedef void (*sz_move_t)(sz_ptr_t, sz_cptr_t, sz_size_t); +typedef union sz_u256_vec_t { +#if SZ_USE_HASWELL + __m256i ymm; + __m128i xmms[2]; +#endif + sz_u64_t u64s[4]; + sz_u32_t u32s[8]; + sz_u16_t u16s[16]; + sz_u8_t u8s[32]; +} sz_u256_vec_t; /** - * @brief Similar to `memset`, fills a string with a given value. - * - * @param target String to fill. - * @param length Number of bytes to fill. - * @param value Value to fill with. + * @brief Helper structure to simplify work with @b 512-bit registers. + * It can help view the contents as 8-bit, 16-bit, 32-bit, or 64-bit integers, + * as well as 4x XMM registers or 2x YMM registers or 1x ZMM register. */ -SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value); +typedef union sz_u512_vec_t { +#if SZ_USE_ICE + __m512i zmm; +#endif +#if SZ_USE_HASWELL + __m256i ymms[2]; + __m128i xmms[4]; +#endif + sz_u64_t u64s[8]; + sz_i64_t i64s[8]; + sz_u32_t u32s[16]; + sz_i32_t i32s[16]; + sz_u16_t u16s[32]; + sz_u8_t u8s[64]; +} sz_u512_vec_t; -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value); +#pragma endregion -typedef void (*sz_fill_t)(sz_ptr_t, sz_size_t, sz_u8_t); +#pragma region UTF8 /** - * @brief Initializes a string class instance to an empty value. + * @brief Extracts just one UTF8 codepoint from a UTF8 string into a 32-bit unsigned integer. */ -SZ_PUBLIC void sz_string_init(sz_string_t *string); - -/** - * @brief Convenience function checking if the provided string is stored inside of the ::string instance itself, - * alternative being - allocated in a remote region of the heap. - */ -SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string); - -/** - * @brief Unpacks the opaque instance of a string class into its components. - * Recommended to use only in read-only operations. - * - * @param string String to unpack. - * @param start Pointer to the start of the string. - * @param length Number of bytes in the string, before the SZ_NULL character. - * @param space Number of bytes allocated for the string (heap or stack), including the SZ_NULL character. - * @param is_external Whether the string is allocated on the heap externally, or fits withing ::string instance. 
- */ -SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, - sz_bool_t *is_external); - -/** - * @brief Unpacks only the start and length of the string. - * Recommended to use only in read-only operations. - * - * @param string String to unpack. - * @param start Pointer to the start of the string. - * @param length Number of bytes in the string, before the SZ_NULL character. - */ -SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length); - -/** - * @brief Constructs a string of a given ::length with noisy contents. - * Use the returned character pointer to populate the string. - * - * @param string String to initialize. - * @param length Number of bytes in the string, before the SZ_NULL character. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator); - -/** - * @brief Doesn't change the contents or the length of the string, but grows the available memory capacity. - * This is beneficial, if several insertions are expected, and we want to minimize allocations. - * - * @param string String to grow. - * @param new_capacity The number of characters to reserve space for, including existing ones. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the new start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator); - -/** - * @brief Grows the string by adding an uninitialized region of ::added_length at the given ::offset. - * Would often be used in conjunction with one or more `sz_copy` calls to populate the allocated region. - * Similar to `sz_string_reserve`, but changes the length of the ::string. - * - * @param string String to grow. - * @param offset Offset of the first byte to reserve space for. - * If provided offset is larger than the length, it will be capped. - * @param added_length The number of new characters to reserve space for. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the new start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length, - sz_memory_allocator_t *allocator); - -/** - * @brief Removes a range from a string. Changes the length, but not the capacity. - * Performs no allocations or deallocations and can't fail. - * - * @param string String to clean. - * @param offset Offset of the first byte to remove. - * @param length Number of bytes to remove. Out-of-bound ranges will be capped. - * @return Number of bytes removed. - */ -SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length); - -/** - * @brief Shrinks the string to fit the current length, if it's allocated on the heap. - * It's the reverse operation of ::sz_string_reserve. - * - * @param string String to shrink. - * @param allocator Memory allocator to use for the allocation. - * @return Whether the operation was successful. The only failures can come from the allocator. - * On failure, the string will remain unchanged. 
- */ -SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator); - -/** - * @brief Frees the string, if it's allocated on the heap. - * If the string is on the stack, the function clears/resets the state. - */ -SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator); - -#pragma endregion - -#pragma region Fast Substring Search API - -typedef sz_cptr_t (*sz_find_byte_t)(sz_cptr_t, sz_size_t, sz_cptr_t); -typedef sz_cptr_t (*sz_find_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_charset_t const *); - -/** - * @brief Locates first matching byte in a string. Equivalent to `memchr(haystack, *needle, h_length)` in LibC. - * - * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memchr.S - * Aarch64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/aarch64/memchr.S - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the first match. - */ -SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** - * @brief Locates last matching byte in a string. Equivalent to `memrchr(haystack, *needle, h_length)` in LibC. - * - * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memrchr.S - * Aarch64 implementation: missing - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the last match. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** - * @brief Locates first matching substring. - * Equivalent to `memmem(haystack, h_length, needle, n_length)` in LibC. - * Similar to `strstr(haystack, needle)` in LibC, but requires known length. - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the first match. - */ -SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** - * @brief Locates the last matching substring. - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the last match. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** - * @brief Finds the first character present from the ::set, present in ::text. - * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. 
- * May have identical implementation and performance to ::sz_rfind_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". - * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the first matching character from ::set. - */ -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** - * @brief Finds the last character present from the ::set, present in ::text. - * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_find_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". - * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the last matching character from ::set. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -#pragma endregion - -#pragma region String Similarity Measures API - -/** - * @brief Computes the Hamming distance between two strings - number of not matching characters. - * Difference in length is is counted as a mismatch. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for the distance, the `bound` if was exceeded. - * - * @see sz_hamming_distance_utf8 - * @see https://en.wikipedia.org/wiki/Hamming_distance - */ -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); - -/** @copydoc sz_hamming_distance */ -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); - -/** - * @brief Computes the Hamming distance between two @b UTF8 strings - number of not matching characters. - * Difference in length is is counted as a mismatch. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for the distance, the `bound` if was exceeded. 
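 *
 * For example, "karolin" and "kathrin" disagree in three positions, so per the contract above:
 *
 * @code{.c}
 *     sz_size_t dist = sz_hamming_distance("karolin", 7, "kathrin", 7, 0); // == 3, no early exit
 * @endcode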
- * - * @see sz_hamming_distance - * @see https://en.wikipedia.org/wiki/Hamming_distance - */ -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); - -/** @copydoc sz_hamming_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); - -typedef sz_size_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t); - -/** - * @brief Computes the Levenshtein edit-distance between two strings using the Wagner-Fisher algorithm. - * Similar to the Needleman-Wunsch alignment algorithm. Often used in fuzzy string matching. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Exclusive upper bound on the distance, that allows us to exit early. - * Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore. - * Pass zero to check if the strings are equal. - * @return Unsigned integer for the edit distance. Zero means the strings are equal. - * Returns the `bound` if it was exceeded or `SZ_SIZE_MAX` if the memory allocation failed. - * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default - * @see https://en.wikipedia.org/wiki/Levenshtein_distance - */ -SZ_DYNAMIC sz_size_t sz_edit_distance(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** @copydoc sz_edit_distance */ -SZ_PUBLIC sz_size_t sz_edit_distance_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** - * @brief Computes the Levenshtein edit-distance between two @b UTF8 strings. - * Unlike `sz_edit_distance`, reports the distance in Unicode codepoints, and not in bytes. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for edit distance, the `bound` if was exceeded or `SZ_SIZE_MAX` - * if the memory allocation failed. 
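 *
 * For example, turning "kitten" into "sitting" takes two substitutions and one insertion, so for this
 * ASCII-only pair both the byte-level and the codepoint-level distance are 3.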
- * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default, sz_edit_distance - * @see https://en.wikipedia.org/wiki/Levenshtein_distance - */ -SZ_DYNAMIC sz_size_t sz_edit_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -typedef sz_size_t (*sz_edit_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_memory_allocator_t *); - -/** @copydoc sz_edit_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_edit_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** - * @brief Computes Needleman–Wunsch alignment score for two string. Often used in bioinformatics and cheminformatics. - * Similar to the Levenshtein edit-distance, parameterized for gap and substitution penalties. - * - * Not commutative in the general case, as the order of the strings matters, as `sz_alignment_score(a, b)` may - * not be equal to `sz_alignment_score(b, a)`. Becomes @b commutative, if the substitution costs are symmetric. - * Equivalent to the negative Levenshtein distance, if: `gap == -1` and `subs[i][j] == (i == j ? 0: -1)`. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * @param gap Penalty cost for gaps - insertions and removals. - * @param subs Substitution costs matrix with 256 x 256 values for all pairs of characters. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @return Signed similarity score. Can be negative, depending on the substitution costs. - * If the memory allocation fails, the function returns `SZ_SSIZE_MAX`. - * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default - * @see https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm - */ -SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); - -/** @copydoc sz_alignment_score */ -SZ_PUBLIC sz_ssize_t sz_alignment_score_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); - -typedef sz_ssize_t (*sz_alignment_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *, - sz_error_cost_t, sz_memory_allocator_t *); - -typedef void (*sz_hash_callback_t)(sz_cptr_t, sz_size_t, sz_u64_t, void *user); - -/** - * @brief Computes the Karp-Rabin rolling hashes of a string supplying them to the provided `callback`. - * Can be used for similarity scores, search, ranking, etc. - * - * Rabin-Karp-like rolling hashes can have very high-level of collisions and depend - * on the choice of bases and the prime number. That's why, often two hashes from the same - * family are used with different bases. - * - * 1. Kernighan and Ritchie's function uses 31, a prime close to the size of English alphabet. - * 2. To be friendlier to byte-arrays and UTF8, we use 257 for the second function. - * - * Choosing the right ::window_length is task- and domain-dependant. 
For example, most English words are - * between 3 and 7 characters long, so a window of 4 bytes would be a good choice. For DNA sequences, - * the ::window_length might be a multiple of 3, as the codons are 3 (nucleotides) bytes long. - * With such minimalistic alphabets of just four characters (AGCT) longer windows might be needed. - * For protein sequences the alphabet is 20 characters long, so the window can be shorter, than for DNAs. - * - * @param text String to hash. - * @param length Number of bytes in the string. - * @param window_length Length of the rolling window in bytes. - * @param window_step Step of reported hashes. @b Must be power of two. Should be smaller than `window_length`. - * @param callback Function receiving the start & length of a substring, the hash, and the `callback_handle`. - * @param callback_handle Optional user-provided pointer to be passed to the `callback`. - * @see sz_hashes_fingerprint, sz_hashes_intersection - */ -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); - -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); - -typedef void (*sz_hashes_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_size_t, sz_hash_callback_t, void *); - -/** - * @brief Computes the Karp-Rabin rolling hashes of a string outputting a binary fingerprint. - * Such fingerprints can be compared with Hamming or Jaccard (Tanimoto) distance for similarity. - * - * The algorithm doesn't clear the fingerprint buffer on start, so it can be invoked multiple times - * to produce a fingerprint of a longer string, by passing the previous fingerprint as the ::fingerprint. - * It can also be reused to produce multi-resolution fingerprints by changing the ::window_length - * and calling the same function multiple times for the same input ::text. - * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text String to hash. - * @param length Number of bytes in the string. - * @param fingerprint Output fingerprint buffer. - * @param fingerprint_bytes Number of bytes in the fingerprint buffer. - * @param window_length Length of the rolling window in bytes. - * @see sz_hashes, sz_hashes_intersection - */ -SZ_PUBLIC void sz_hashes_fingerprint( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_ptr_t fingerprint, sz_size_t fingerprint_bytes); - -typedef void (*sz_hashes_fingerprint_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_ptr_t, sz_size_t); - -/** - * @brief Given a hash-fingerprint of a textual document, computes the number of intersecting hashes - * of the incoming document. Can be used for document scoring and search. - * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text Input document. - * @param length Number of bytes in the input document. - * @param fingerprint Reference document fingerprint. - * @param fingerprint_bytes Number of bytes in the reference documents fingerprint. - * @param window_length Length of the rolling window in bytes. 
- * @see sz_hashes, sz_hashes_fingerprint - */ -SZ_PUBLIC sz_size_t sz_hashes_intersection( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_cptr_t fingerprint, sz_size_t fingerprint_bytes); - -typedef sz_size_t (*sz_hashes_intersection_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_cptr_t, sz_size_t); - -#pragma endregion - -#pragma region Convenience API - -/** - * @brief Finds the first character in the haystack, that is present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the first character in the haystack, that is @b not present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); +SZ_INTERNAL void _sz_extract_utf8_rune(sz_cptr_t utf8, sz_rune_t *code, sz_rune_length_t *code_length) { + sz_u8_t const *current = (sz_u8_t const *)utf8; + sz_u8_t leading_byte = *current++; + sz_rune_t ch; + sz_rune_length_t ch_length; -/** - * @brief Finds the last character in the haystack, that is present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); + // TODO: This can be made entirely branchless using 32-bit SWAR. + if (leading_byte < 0x80) { + // Single-byte rune (0xxxxxxx) + ch = leading_byte; + ch_length = sz_utf8_rune_1byte_k; + } + else if ((leading_byte & 0xE0) == 0xC0) { + // Two-byte rune (110xxxxx 10xxxxxx) + ch = (leading_byte & 0x1F) << 6; + ch |= (*current++ & 0x3F); + ch_length = sz_utf8_rune_2bytes_k; + } + else if ((leading_byte & 0xF0) == 0xE0) { + // Three-byte rune (1110xxxx 10xxxxxx 10xxxxxx) + ch = (leading_byte & 0x0F) << 12; + ch |= (*current++ & 0x3F) << 6; + ch |= (*current++ & 0x3F); + ch_length = sz_utf8_rune_3bytes_k; + } + else if ((leading_byte & 0xF8) == 0xF0) { + // Four-byte rune (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) + ch = (leading_byte & 0x07) << 18; + ch |= (*current++ & 0x3F) << 12; + ch |= (*current++ & 0x3F) << 6; + ch |= (*current++ & 0x3F); + ch_length = sz_utf8_rune_4bytes_k; + } + else { + // Invalid UTF8 rune. + ch = 0; + ch_length = sz_utf8_invalid_k; + } + *code = ch; + *code_length = ch_length; +} /** - * @brief Finds the last character in the haystack, that is @b not present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset + * @brief Exports a UTF8 string into a UTF32 buffer. + * ! The result is undefined id the UTF8 string is corrupted. + * @return The length in the number of codepoints. 
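 *
 * A minimal usage sketch; the buffer size is an assumption that is always safe, since the UTF32 output
 * never holds more codepoints than the UTF8 input has bytes:
 *
 * @code{.c}
 *     char const *text = "héllo";                                 // 6 bytes, 5 codepoints
 *     sz_rune_t runes[6];                                         // one slot per input byte suffices
 *     sz_size_t count = _sz_export_utf8_to_utf32(text, 6, runes); // count == 5
 * @endcode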
*/ -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); +SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_length, sz_rune_t *utf32) { + sz_cptr_t const end = utf8 + utf8_length; + sz_size_t count = 0; + sz_rune_length_t rune_length; + for (; utf8 != end; utf8 += rune_length, utf32++, count++) _sz_extract_utf8_rune(utf8, utf32, &rune_length); + return count; +} #pragma endregion @@ -1105,273 +627,66 @@ typedef struct sz_sequence_t { * Expects ::offsets to contains `count + 1` entries, the last pointing at the end * of the last string, indicating the total length of the ::tape. */ -SZ_PUBLIC void sz_sequence_from_u32tape(sz_cptr_t *start, sz_u32_t const *offsets, sz_size_t count, - sz_sequence_t *sequence); +SZ_PUBLIC void sz_sequence_from_u32tape( // + sz_cptr_t *start, sz_u32_t const *offsets, sz_size_t count, sz_sequence_t *sequence); /** * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. * Expects ::offsets to contains `count + 1` entries, the last pointing at the end * of the last string, indicating the total length of the ::tape. */ -SZ_PUBLIC void sz_sequence_from_u64tape(sz_cptr_t *start, sz_u64_t const *offsets, sz_size_t count, - sz_sequence_t *sequence); +SZ_PUBLIC void sz_sequence_from_u64tape( // + sz_cptr_t *start, sz_u64_t const *offsets, sz_size_t count, sz_sequence_t *sequence); -/** - * @brief Similar to `std::partition`, given a predicate splits the sequence into two parts. - * The algorithm is unstable, meaning that elements may change relative order, as long - * as they are in the right partition. This is the simpler algorithm for partitioning. - */ -SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate); +#pragma endregion -/** - * @brief Inplace `std::set_union` for two consecutive chunks forming the same continuous `sequence`. +#pragma region Helper Functions + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC visibility push(hidden) + +/* + ********************************************************************************************************************** + ********************************************************************************************************************** + ********************************************************************************************************************** + * + * This is where we the actual implementation begins. + * The rest of the file is hidden from the public API. * - * @param partition The number of elements in the first sub-sequence in `sequence`. - * @param less Comparison function, to determine the lexicographic ordering. + ********************************************************************************************************************** + ********************************************************************************************************************** + ********************************************************************************************************************** */ -SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less); /** - * @brief Sorting algorithm, combining Radix Sort for the first 32 bits of every word - * and a follow-up by a more conventional sorting procedure on equally prefixed parts. + * @brief Helper-macro to mark potentially unused variables. 
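 * Typically used to silence unused-argument warnings in callbacks, e.g. `sz_unused(start && length);`
 * inside a hash callback that only consumes the hash itself.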
*/ -SZ_PUBLIC void sz_sort(sz_sequence_t *sequence); +#define sz_unused(x) ((void)(x)) /** - * @brief Partial sorting algorithm, combining Radix Sort for the first 32 bits of every word - * and a follow-up by a more conventional sorting procedure on equally prefixed parts. + * @brief Helper-macro casting a variable to another type of the same size. */ -SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t n); +#define sz_bitcast(type, value) (*((type *)&(value))) /** - * @brief Intro-Sort algorithm that supports custom comparators. - */ -SZ_PUBLIC void sz_sort_intro(sz_sequence_t *sequence, sz_sequence_comparator_t less); - -#pragma endregion - -/* - * Hardware feature detection. - * All of those can be controlled by the user. + * @brief Defines `SZ_NULL`, analogous to `NULL`. + * The default often comes from locale.h, stddef.h, + * stdio.h, stdlib.h, string.h, time.h, or wchar.h. */ -#ifndef SZ_USE_X86_AVX512 -#ifdef __AVX512BW__ -#define SZ_USE_X86_AVX512 1 +#ifdef __GNUG__ +#define SZ_NULL __null +#define SZ_NULL_CHAR __null #else -#define SZ_USE_X86_AVX512 0 -#endif +#define SZ_NULL ((void *)0) +#define SZ_NULL_CHAR ((char *)0) #endif -#ifndef SZ_USE_X86_AVX2 -#ifdef __AVX2__ -#define SZ_USE_X86_AVX2 1 -#else -#define SZ_USE_X86_AVX2 0 -#endif -#endif - -#ifndef SZ_USE_ARM_NEON -#ifdef __ARM_NEON -#define SZ_USE_ARM_NEON 1 -#else -#define SZ_USE_ARM_NEON 0 -#endif -#endif - -#ifndef SZ_USE_ARM_SVE -#ifdef __ARM_FEATURE_SVE -#define SZ_USE_ARM_SVE 1 -#else -#define SZ_USE_ARM_SVE 0 -#endif -#endif - -/* - * Include hardware-specific headers. - */ -#if SZ_USE_X86_AVX512 || SZ_USE_X86_AVX2 -#include -#endif // SZ_USE_X86... -#if SZ_USE_ARM_NEON -#if !defined(_MSC_VER) -#include -#endif -#include -#endif // SZ_USE_ARM_NEON -#if SZ_USE_ARM_SVE -#if !defined(_MSC_VER) -#include -#endif -#endif // SZ_USE_ARM_SVE - -#pragma region Hardware Specific API - -#if SZ_USE_X86_AVX512 - -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_edit_distance */ -SZ_PUBLIC sz_size_t sz_edit_distance_avx512(sz_cptr_t a, 
sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); -/** @copydoc sz_alignment_score */ -SZ_PUBLIC sz_ssize_t sz_alignment_score_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle); -#endif - -#if SZ_USE_X86_AVX2 -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle); -#endif - -#if SZ_USE_ARM_NEON -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const 
*set); -#endif - -#if SZ_USE_ARM_SVE -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_sve(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_sve(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_sve(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_sve(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_sve(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -#endif - -#pragma endregion - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wconversion" - -/* - ********************************************************************************************************************** - ********************************************************************************************************************** - ********************************************************************************************************************** - * - * This is where we the actual implementation begins. - * The rest of the file is hidden from the public API. - * - ********************************************************************************************************************** - ********************************************************************************************************************** - ********************************************************************************************************************** - */ - -#pragma region Compiler Extensions and Helper Functions - -#pragma GCC visibility push(hidden) - -/** - * @brief Helper-macro to mark potentially unused variables. - */ -#define sz_unused(x) ((void)(x)) - -/** - * @brief Helper-macro casting a variable to another type of the same size. - */ -#define sz_bitcast(type, value) (*((type *)&(value))) - -/** - * @brief Defines `SZ_NULL`, analogous to `NULL`. - * The default often comes from locale.h, stddef.h, - * stdio.h, stdlib.h, string.h, time.h, or wchar.h. - */ -#ifdef __GNUG__ -#define SZ_NULL __null -#define SZ_NULL_CHAR __null -#else -#define SZ_NULL ((void *)0) -#define SZ_NULL_CHAR ((char *)0) -#endif - -/** - * @brief Cache-line width, that will affect the execution of some algorithms, - * like equality checks and relative order computing. - */ -#define SZ_CACHE_LINE_WIDTH (64) // bytes +/** + * @brief Cache-line width, that will affect the execution of some algorithms, + * like equality checks and relative order computing. 
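 *
 * A hedged illustration of how such a constant is typically consumed; `length` is an assumed variable:
 *
 * @code{.c}
 *     sz_size_t whole_lines = length / SZ_CACHE_LINE_WIDTH; // candidates for full-width comparisons
 *     sz_size_t tail_bytes = length % SZ_CACHE_LINE_WIDTH;  // handled by a scalar or masked tail
 * @endcode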
+ */ +#define SZ_CACHE_LINE_WIDTH (64) // bytes /** * @brief Similar to `assert`, the `sz_assert` is used in the SZ_DEBUG mode @@ -1467,6 +782,17 @@ SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return __builtin_bswap SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return __builtin_bswap32(val); } #endif +/* + */ +SZ_INTERNAL sz_u16_t _sz_u16_mask_until(sz_size_t n) { return (0x0001u << n) - 1u; } +SZ_INTERNAL sz_u32_t _sz_u32_mask_until(sz_size_t n) { return (0x00000001u << n) - 1u; } +SZ_INTERNAL sz_u64_t _sz_u64_mask_until(sz_size_t n) { return (0x0000000000000001ull << n) - 1ull; } +SZ_INTERNAL sz_u16_t _sz_u16_clamp_mask_until(sz_size_t n) { return n < 16 ? _sz_u16_mask_until(n) : 0xFFFFu; } +SZ_INTERNAL sz_u32_t _sz_u32_clamp_mask_until(sz_size_t n) { return n < 32 ? _sz_u32_mask_until(n) : 0xFFFFFFFFu; } +SZ_INTERNAL sz_u64_t _sz_u64_clamp_mask_until(sz_size_t n) { + return n < 64 ? _sz_u64_mask_until(n) : 0xFFFFFFFFFFFFFFFFull; +} + SZ_INTERNAL sz_u64_t sz_u64_rotl(sz_u64_t x, sz_u64_t r) { return (x << r) | (x >> (64 - r)); } /** @@ -1497,5655 +823,284 @@ SZ_INTERNAL sz_u64_t sz_u64_blend(sz_u64_t a, sz_u64_t b, sz_u64_t mask) { retur * * Alternatively, to avoid multiplication: * - * x & ~((x < y) - 1) + y & ((x < y) - 1) // 6 unique operations - */ -#define sz_min_of_two(x, y) (x < y ? x : y) -#define sz_max_of_two(x, y) (x < y ? y : x) -#define sz_min_of_three(x, y, z) sz_min_of_two(x, sz_min_of_two(y, z)) -#define sz_max_of_three(x, y, z) sz_max_of_two(x, sz_max_of_two(y, z)) - -/** @brief Branchless minimum function for two signed 32-bit integers. */ -SZ_INTERNAL sz_i32_t sz_i32_min_of_two(sz_i32_t x, sz_i32_t y) { return y + ((x - y) & (x - y) >> 31); } - -/** @brief Branchless minimum function for two signed 32-bit integers. */ -SZ_INTERNAL sz_i32_t sz_i32_max_of_two(sz_i32_t x, sz_i32_t y) { return x - ((x - y) & (x - y) >> 31); } - -/** - * @brief Clamps signed offsets in a string to a valid range. Used for Pythonic-style slicing. - */ -SZ_INTERNAL void sz_ssize_clamp_interval(sz_size_t length, sz_ssize_t start, sz_ssize_t end, - sz_size_t *normalized_offset, sz_size_t *normalized_length) { - // TODO: Remove branches. - // Normalize negative indices - if (start < 0) start += length; - if (end < 0) end += length; - - // Clamp indices to a valid range - if (start < 0) start = 0; - if (end < 0) end = 0; - if (start > (sz_ssize_t)length) start = length; - if (end > (sz_ssize_t)length) end = length; - - // Ensure start <= end - if (start > end) start = end; - - *normalized_offset = start; - *normalized_length = end - start; -} - -/** - * @brief Compute the logarithm base 2 of a positive integer, rounding down. - */ -SZ_INTERNAL sz_size_t sz_size_log2i_nonzero(sz_size_t x) { - sz_assert(x > 0 && "Non-positive numbers have no defined logarithm"); - sz_size_t leading_zeros = sz_u64_clz(x); - return 63 - leading_zeros; -} - -/** - * @brief Compute the smallest power of two greater than or equal to ::x. - */ -SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) { - // Unlike the commonly used trick with `clz` intrinsics, is valid across the whole range of `x`. - // https://stackoverflow.com/a/10143264 - x--; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; -#if SZ_DETECT_64_BIT - x |= x >> 32; -#endif - x++; - return x; -} - -/** - * @brief Transposes an 8x8 bit matrix packed in a `sz_u64_t`. 
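 *
 * For example, an input with exactly one byte equal to 0xFF (one full row of the matrix) transposes
 * into a value where every byte has the same single bit set (one full column), and vice versa.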
- * - * There is a well known SWAR sequence for that known to chess programmers, - * willing to flip a bit-matrix of pieces along the main A1-H8 diagonal. - * https://www.chessprogramming.org/Flipping_Mirroring_and_Rotating - * https://lukas-prokop.at/articles/2021-07-23-transpose - */ -SZ_INTERNAL sz_u64_t sz_u64_transpose(sz_u64_t x) { - sz_u64_t t; - t = x ^ (x << 36); - x ^= 0xf0f0f0f00f0f0f0full & (t ^ (x >> 36)); - t = 0xcccc0000cccc0000ull & (x ^ (x << 18)); - x ^= t ^ (t >> 18); - t = 0xaa00aa00aa00aa00ull & (x ^ (x << 9)); - x ^= t ^ (t >> 9); - return x; -} - -/** - * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. - */ -SZ_INTERNAL void sz_u64_swap(sz_u64_t *a, sz_u64_t *b) { - sz_u64_t t = *a; - *a = *b; - *b = t; -} - -/** - * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. - */ -SZ_INTERNAL void sz_pointer_swap(void **a, void **b) { - void *t = *a; - *a = *b; - *b = t; -} - -/** - * @brief Helper structure to simplify work with 16-bit words. - * @see sz_u16_load - */ -typedef union sz_u16_vec_t { - sz_u16_t u16; - sz_u8_t u8s[2]; -} sz_u16_vec_t; - -/** - * @brief Load a 16-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u16_vec_t sz_u16_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u16_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u16_vec_t *)ptr); -#else - return *((__unaligned sz_u16_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u16_vec_t const *result = (sz_u16_vec_t const *)ptr; - return *result; -#endif -} - -/** - * @brief Helper structure to simplify work with 32-bit words. - * @see sz_u32_load - */ -typedef union sz_u32_vec_t { - sz_u32_t u32; - sz_u16_t u16s[2]; - sz_u8_t u8s[4]; -} sz_u32_vec_t; - -/** - * @brief Load a 32-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u32_vec_t sz_u32_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u32_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - result.u8s[2] = ptr[2]; - result.u8s[3] = ptr[3]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u32_vec_t *)ptr); -#else - return *((__unaligned sz_u32_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u32_vec_t const *result = (sz_u32_vec_t const *)ptr; - return *result; -#endif -} - -/** - * @brief Helper structure to simplify work with 64-bit words. - * @see sz_u64_load - */ -typedef union sz_u64_vec_t { - sz_u64_t u64; - sz_u32_t u32s[2]; - sz_u16_t u16s[4]; - sz_u8_t u8s[8]; -} sz_u64_vec_t; - -/** - * @brief Load a 64-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u64_vec_t sz_u64_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u64_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - result.u8s[2] = ptr[2]; - result.u8s[3] = ptr[3]; - result.u8s[4] = ptr[4]; - result.u8s[5] = ptr[5]; - result.u8s[6] = ptr[6]; - result.u8s[7] = ptr[7]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. 
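/*  Illustrative aside: the union-based loaders above exist to avoid undefined behavior
 *  on targets that cannot dereference misaligned pointers. A minimal portable sketch of
 *  the same idea (hypothetical helper, not part of the library API) is a `memcpy` into
 *  a local word, which mainstream compilers fold into a single load where the ISA allows it.
 */
#include <stdint.h> /* uint64_t - needed only for this sketch */
#include <string.h> /* memcpy   - needed only for this sketch */

static inline uint64_t load_u64_unaligned(char const *ptr) {
    uint64_t word;
    memcpy(&word, ptr, sizeof(word)); /* byte-wise copy, no alignment assumptions */
    return word;
}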
- return *((sz_u64_vec_t *)ptr); -#else - return *((__unaligned sz_u64_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u64_vec_t const *result = (sz_u64_vec_t const *)ptr; - return *result; -#endif -} - -/** @brief Helper function, using the supplied fixed-capacity buffer to allocate memory. */ -SZ_INTERNAL sz_ptr_t _sz_memory_allocate_fixed(sz_size_t length, void *handle) { - sz_size_t capacity; - sz_copy((sz_ptr_t)&capacity, (sz_cptr_t)handle, sizeof(sz_size_t)); - sz_size_t consumed_capacity = sizeof(sz_size_t); - if (consumed_capacity + length > capacity) return SZ_NULL_CHAR; - return (sz_ptr_t)handle + consumed_capacity; -} - -/** @brief Helper "no-op" function, simulating memory deallocation when we use a "static" memory buffer. */ -SZ_INTERNAL void _sz_memory_free_fixed(sz_ptr_t start, sz_size_t length, void *handle) { - sz_unused(start && length && handle); -} - -/** @brief An internal callback used to set a bit in a power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) & (fingerprint_bytes - 1)] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback used to set a bit in a @b non power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_non_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) % fingerprint_bytes] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback, used to mix all the running hashes into one pointer-size value. */ -SZ_INTERNAL void _sz_hashes_fingerprint_scalar_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *scalar_handle) { - sz_unused(start && length && hash && scalar_handle); - sz_size_t *scalar_ptr = (sz_size_t *)scalar_handle; - *scalar_ptr ^= hash; -} - -/** - * @brief Chooses the offsets of the most interesting characters in a search needle. - * - * Search throughput can significantly deteriorate if we are matching the wrong characters. - * Say the needle is "aXaYa", and we are comparing the first, second, and last character. - * If we use SIMD and compare many offsets at a time, comparing against "a" in every register is a waste. - * - * Similarly, dealing with UTF8 inputs, we know that the lower bits of each character code carry more information. - * Cyrillic alphabet, for example, falls into [0x0410, 0x042F] code range for uppercase [А, Я], and - * into [0x0430, 0x044F] for lowercase [а, я]. Scanning through a text written in Russian, half of the - * bytes will carry absolutely no value and will be equal to 0x04. - */ -SZ_INTERNAL void _sz_locate_needle_anomalies(sz_cptr_t start, sz_size_t length, // - sz_size_t *first, sz_size_t *second, sz_size_t *third) { - *first = 0; - *second = length / 2; - *third = length - 1; - - // - int has_duplicates = // - start[*first] == start[*second] || // - start[*first] == start[*third] || // - start[*second] == start[*third]; - - // Loop through letters to find non-colliding variants. 
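/*  Illustrative aside: a worked example of the pivoting below. For the needle "aXaYa"
 *  (length 5) the initial picks are offsets 0, 2, and 4 - all equal to 'a'. The middle
 *  offset walks right from 2 to 3 and lands on 'Y', while the last offset cannot move
 *  left past `second + 1` and stays at 4. The compared bytes become 'a', 'Y', 'a',
 *  which already collide far less often than three identical 'a' comparisons.
 */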
- if (length > 3 && has_duplicates) { - // Pivot the middle point right, until we find a character different from the first one. - for (; start[*second] == start[*first] && *second + 1 < *third; ++(*second)) {} - // Pivot the third (last) point left, until we find a different character. - for (; (start[*third] == start[*second] || start[*third] == start[*first]) && *third > (*second + 1); - --(*third)) {} - } - - // TODO: Investigate alternative strategies for long needles. - // On very long needles we have the luxury to choose! - // Often dealing with UTF8, we will likely benefit from shifting the first and second characters - // further to the right, to achieve not only uniqueness within the needle, but also avoid common - // rune prefixes of 2-, 3-, and 4-byte codes. - if (length > 8) { - // Pivot the first and second points right, until we find a character, that: - // > is different from others. - // > doesn't start with 0b'110x'xxxx - only 5 bits of relevant info. - // > doesn't start with 0b'1110'xxxx - only 4 bits of relevant info. - // > doesn't start with 0b'1111'0xxx - only 3 bits of relevant info. - // - // So we are practically searching for byte values that start with 0b0xxx'xxxx or 0b'10xx'xxxx. - // Meaning they fall in the range [0, 127] and [128, 191], in other words any unsigned int up to 191. - sz_u8_t const *start_u8 = (sz_u8_t const *)start; - sz_size_t vibrant_first = *first, vibrant_second = *second, vibrant_third = *third; - - // Let's begin with the seccond character, as the termination criteria there is more obvious - // and we may end up with more variants to check for the first candidate. - for (; (start_u8[vibrant_second] > 191 || start_u8[vibrant_second] == start_u8[vibrant_third]) && - (vibrant_second + 1 < vibrant_third); - ++vibrant_second) {} - - // Now check if we've indeed found a good candidate or should revert the `vibrant_second` to `second`. - if (start_u8[vibrant_second] < 191) { *second = vibrant_second; } - else { vibrant_second = *second; } - - // Now check the first character. - for (; (start_u8[vibrant_first] > 191 || start_u8[vibrant_first] == start_u8[vibrant_second] || - start_u8[vibrant_first] == start_u8[vibrant_third]) && - (vibrant_first + 1 < vibrant_second); - ++vibrant_first) {} - - // Now check if we've indeed found a good candidate or should revert the `vibrant_first` to `first`. - // We don't need to shift the third one when dealing with texts as the last byte of the text is - // also the last byte of a rune and contains the most information. 
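/*  Illustrative aside: a small named predicate for the `> 191` checks used here
 *  (hypothetical helper, not part of the library API). In UTF8, values 0x00-0x7F are
 *  ASCII and 0x80-0xBF are continuation bytes; anything from 0xC0 (192) upward is a
 *  multi-byte lead byte whose fixed high bits carry little entropy for filtering.
 */
#include <stdint.h> /* uint8_t - needed only for this sketch */

static inline int utf8_byte_is_informative(uint8_t byte) {
    return byte < 0xC0; /* ASCII or continuation byte, i.e. a value up to 191 */
}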
- if (start_u8[vibrant_first] < 191) { *first = vibrant_first; } - } -} - -#pragma GCC visibility pop -#pragma endregion - -#pragma region Serial Implementation - -#if !SZ_AVOID_LIBC -#include // `fprintf` -#include // `malloc`, `EXIT_FAILURE` - -SZ_PUBLIC void *_sz_memory_allocate_default(sz_size_t length, void *handle) { - sz_unused(handle); - return malloc(length); -} -SZ_PUBLIC void _sz_memory_free_default(sz_ptr_t start, sz_size_t length, void *handle) { - sz_unused(handle && length); - free(start); -} - -#endif - -SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc) { -#if !SZ_AVOID_LIBC - alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_default; - alloc->free = (sz_memory_free_t)_sz_memory_free_default; -#else - alloc->allocate = (sz_memory_allocate_t)SZ_NULL; - alloc->free = (sz_memory_free_t)SZ_NULL; -#endif - alloc->handle = SZ_NULL; -} - -SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length) { - // The logic here is simple - put the buffer length in the first slots of the buffer. - // Later use it for bounds checking. - alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_fixed; - alloc->free = (sz_memory_free_t)_sz_memory_free_fixed; - alloc->handle = &buffer; - sz_copy((sz_ptr_t)buffer, (sz_cptr_t)&length, sizeof(sz_size_t)); -} - -/** - * @brief Byte-level equality comparison between two strings. - * If unaligned loads are allowed, uses a switch-table to avoid loops on short strings. - */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_cptr_t const a_end = a + length; -#if SZ_USE_MISALIGNED_LOADS - if (length >= SZ_SWAR_THRESHOLD) { - sz_u64_vec_t a_vec, b_vec; - for (; a + 8 <= a_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) return sz_false_k; - } - } -#endif - while (a != a_end && *a == *b) a++, b++; - return (sz_bool_t)(a_end == a); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { - for (sz_cptr_t const end = text + length; text != end; ++text) - if (sz_charset_contains(set, *text)) return text; - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Warray-bounds" - sz_cptr_t const end = text; - for (text += length; text != end;) - if (sz_charset_contains(set, *(text -= 1))) return text; - return SZ_NULL_CHAR; -#pragma GCC diagnostic pop -} - -/** - * One option to avoid branching is to use conditional moves and lookup the comparison result in a table: - * sz_ordering_t ordering_lookup[2] = {sz_greater_k, sz_less_k}; - * for (; a != min_end; ++a, ++b) - * if (*a != *b) return ordering_lookup[*a < *b]; - * That, however, introduces a data-dependency. - * A cleaner option is to perform two comparisons and a subtraction. - * One instruction more, but no data-dependency. - */ -#define _sz_order_scalars(a, b) ((sz_ordering_t)((a > b) - (a < b))) - -SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - sz_bool_t a_shorter = (sz_bool_t)(a_length < b_length); - sz_size_t min_length = a_shorter ? 
a_length : b_length; - sz_cptr_t min_end = a + min_length; -#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN - for (sz_u64_vec_t a_vec, b_vec; a + 8 <= min_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) - return _sz_order_scalars(sz_u64_bytes_reverse(a_vec.u64), sz_u64_bytes_reverse(b_vec.u64)); - } -#endif - for (; a != min_end; ++a, ++b) - if (*a != *b) return _sz_order_scalars(*a, *b); - - // If the strings are equal up to `min_end`, then the shorter string is smaller - return _sz_order_scalars(a_length, b_length); -} - -/** - * @brief Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each byte is set. - // For that take the bottom 7 bits of each byte, add one to them, - // and if this sets the top bit to one, then all the 7 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) & ((vec.u64 & 0x8080808080808080ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memchr(haystack, needle[0], haystack_length)`. - */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_end = h + h_length; - -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. -#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. - sz_u64_vec_t h_vec, n_vec, match_vec; - match_vec.u64 = 0; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h + 8 <= h_end; h += 8) { - h_vec.u64 = *(sz_u64_t const *)h; - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h + sz_u64_ctz(match_vec.u64) / 8; - } -#endif - - // Handle the misaligned tail. - for (; h < h_end; ++h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief Find the last occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memrchr(haystack, needle[0], haystack_length)`. - */ -sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_start = h; - - // Reposition the `h` pointer to the end, as we will be walking backwards. - h = h + h_length - 1; - -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. -#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)(h + 1) & 7ull) && h >= h_start; --h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. 
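/*  Illustrative aside: a standalone sketch of the SWAR byte-equality trick used by
 *  `sz_find_byte_serial` and `sz_rfind_byte_serial` (hypothetical helper, not part of
 *  the library API). The needle byte is broadcast with a 0x01...01 multiplication, the
 *  XOR with the haystack word is inverted so matching bytes become 0xFF, and the carry
 *  trick converts every fully-set byte into a single flag in its top bit. The offset of
 *  the first match within the word is then `ctz(mask) / 8`.
 */
#include <stdint.h> /* uint8_t, uint64_t - needed only for this sketch */

static inline uint64_t swar_bytes_equal_mask(uint64_t word, uint8_t needle) {
    uint64_t broadcast = 0x0101010101010101ull * needle;
    uint64_t inverted_xor = ~(word ^ broadcast); /* 0xFF in every byte that matches */
    return ((inverted_xor & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) &
           (inverted_xor & 0x8080808080808080ull); /* one flag bit per matching byte */
}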
- sz_u64_vec_t h_vec, n_vec, match_vec; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h >= h_start + 7; h -= 8) { - h_vec.u64 = *(sz_u64_t const *)(h - 7); - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h - sz_u64_clz(match_vec.u64) / 8; - } -#endif - - for (; h >= h_start; --h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 2Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 2byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_2byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 2byte is set. - // For that take the bottom 15 bits of each 2byte, add one to them, - // and if this sets the top bit to one, then all the 15 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFF7FFF7FFF7FFFull) + 0x0001000100010001ull) & ((vec.u64 & 0x8000800080008000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b two-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_2byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 2 bytes long. - sz_assert(h_length >= 2 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; -#endif - - sz_u64_vec_t h_even_vec, h_odd_vec, n_vec, matches_even_vec, matches_odd_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1]; - n_vec.u64 *= 0x0001000100010001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. - for (; h + 9 <= h_end; h += 8) { - h_even_vec.u64 = *(sz_u64_t *)h; - h_odd_vec.u64 = (h_even_vec.u64 >> 8) | ((sz_u64_t)h[8] << 56); - matches_even_vec = _sz_u64_each_2byte_equal(h_even_vec, n_vec); - matches_odd_vec = _sz_u64_each_2byte_equal(h_odd_vec, n_vec); - - matches_even_vec.u64 >>= 8; - if (matches_even_vec.u64 + matches_odd_vec.u64) { - sz_u64_t match_indicators = matches_even_vec.u64 | matches_odd_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 4Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 4byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_4byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 4byte is set. - // For that take the bottom 31 bits of each 4byte, add one to them, - // and if this sets the top bit to one, then all the 31 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFFFFFF7FFFFFFFull) + 0x0000000100000001ull) & ((vec.u64 & 0x8000000080000000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b four-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. 
- */ -SZ_INTERNAL sz_cptr_t _sz_find_4byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 4 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 4 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; -#endif - - sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, n_vec, matches0_vec, matches1_vec, matches2_vec, matches3_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2], n_vec.u8s[3] = n[3]; - n_vec.u64 *= 0x0000000100000001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using four 64-bit words. - // We load the subsequent four-byte word as well, taking its first bytes. Think of it as a glorified prefetch :) - sz_u64_t h_page_current, h_page_next; - for (; h + sizeof(sz_u64_t) + sizeof(sz_u32_t) <= h_end; h += sizeof(sz_u64_t)) { - h_page_current = *(sz_u64_t *)h; - h_page_next = *(sz_u32_t *)(h + 8); - h0_vec.u64 = (h_page_current); - h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); - h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); - h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); - matches0_vec = _sz_u64_each_4byte_equal(h0_vec, n_vec); - matches1_vec = _sz_u64_each_4byte_equal(h1_vec, n_vec); - matches2_vec = _sz_u64_each_4byte_equal(h2_vec, n_vec); - matches3_vec = _sz_u64_each_4byte_equal(h3_vec, n_vec); - - if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64) { - matches0_vec.u64 >>= 24; - matches1_vec.u64 >>= 16; - matches2_vec.u64 >>= 8; - sz_u64_t match_indicators = matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 4 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 3Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 3byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_3byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 4byte is set. - // For that take the bottom 31 bits of each 4byte, add one to them, - // and if this sets the top bit to one, then all the 31 bits are ones as well. - vec.u64 = ((vec.u64 & 0xFFFF7FFFFF7FFFFFull) + 0x0000000001000001ull) & ((vec.u64 & 0x0000800000800000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b three-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_3byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 3 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. 
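/*  Illustrative aside: the `(sz_size_t)h & 7ull` guards peel the misaligned head
 *  byte-by-byte so that the main loop only issues naturally aligned 64-bit loads.
 *  A hypothetical helper expressing the same check, assuming a power-of-two alignment:
 */
#include <stdint.h> /* uintptr_t - needed only for this sketch */
#include <stddef.h> /* size_t    - needed only for this sketch */

static inline size_t bytes_until_aligned(void const *pointer, size_t alignment) {
    size_t misalignment = (size_t)((uintptr_t)pointer & (alignment - 1));
    return misalignment ? alignment - misalignment : 0;
}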
- for (; ((sz_size_t)h & 7ull) && h + 3 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h; -#endif - - // We fetch 12 - sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, h4_vec; - sz_u64_vec_t matches0_vec, matches1_vec, matches2_vec, matches3_vec, matches4_vec; - sz_u64_vec_t n_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2]; - n_vec.u64 *= 0x0000000001000001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using three 64-bit words. - // We load the subsequent two-byte word as well. - sz_u64_t h_page_current, h_page_next; - for (; h + sizeof(sz_u64_t) + sizeof(sz_u16_t) <= h_end; h += sizeof(sz_u64_t)) { - h_page_current = *(sz_u64_t *)h; - h_page_next = *(sz_u16_t *)(h + 8); - h0_vec.u64 = (h_page_current); - h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); - h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); - h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); - h4_vec.u64 = (h_page_current >> 32) | (h_page_next << 32); - matches0_vec = _sz_u64_each_3byte_equal(h0_vec, n_vec); - matches1_vec = _sz_u64_each_3byte_equal(h1_vec, n_vec); - matches2_vec = _sz_u64_each_3byte_equal(h2_vec, n_vec); - matches3_vec = _sz_u64_each_3byte_equal(h3_vec, n_vec); - matches4_vec = _sz_u64_each_3byte_equal(h4_vec, n_vec); - - if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64) { - matches0_vec.u64 >>= 16; - matches1_vec.u64 >>= 8; - matches3_vec.u64 <<= 8; - matches4_vec.u64 <<= 16; - sz_u64_t match_indicators = - matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 3 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_find_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - // Several popular string matching algorithms are using a bad-character shift table. - // Boyer Moore: https://www-igm.univ-mlv.fr/~lecroq/string/node14.html - // Quick Search: https://www-igm.univ-mlv.fr/~lecroq/string/node19.html - // Smith: https://www-igm.univ-mlv.fr/~lecroq/string/node21.html - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) bad_shift_table.jumps[n[i]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. 
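/*  Illustrative aside: a textbook Horspool search without the SWAR table initialization
 *  and without the Raita-style filtering used in this function, to make the shift logic
 *  easier to follow. The helper is hypothetical and not part of the library API.
 */
#include <stddef.h> /* size_t  - needed only for this sketch */
#include <stdint.h> /* uint8_t - needed only for this sketch */
#include <string.h> /* memcmp  - needed only for this sketch */

static char const *horspool_find(char const *haystack, size_t haystack_length, //
                                 char const *needle, size_t needle_length) {
    if (!needle_length || haystack_length < needle_length) return NULL;
    size_t shifts[256];
    for (size_t i = 0; i != 256; ++i) shifts[i] = needle_length;
    for (size_t i = 0; i + 1 < needle_length; ++i) shifts[(uint8_t)needle[i]] = needle_length - i - 1;
    for (size_t position = 0; position + needle_length <= haystack_length;) {
        if (memcmp(haystack + position, needle, needle_length) == 0) return haystack + position;
        // Jump by the shift of the haystack byte aligned with the last needle position - always at least one.
        position += shifts[(uint8_t)haystack[position + needle_length - 1]];
    }
    return NULL;
}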
- sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the last `n_length - 1` bytes. - for (sz_size_t i = 0; i <= h_length - n_length;) { - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - i += bad_shift_table.jumps[h[i + n_length - 1]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for @b reverse-order exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) - bad_shift_table.jumps[n[n_length - i - 1]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the first `n_length - 1` bytes. - for (sz_size_t j = 0; j <= h_length - n_length;) { - sz_size_t i = h_length - n_length - j; - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - j += bad_shift_table.jumps[h[i]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Exact substring search helper function, that finds the first occurrence of a prefix of the needle - * using a given search function, and then verifies the remaining part of the needle. 
- */ -SZ_INTERNAL sz_cptr_t _sz_find_with_prefix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_prefix, sz_size_t prefix_length) { - - sz_size_t suffix_length = n_length - prefix_length; - while (1) { - sz_cptr_t found = find_prefix(h, h_length, n, prefix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = h_length - (found - h); - if (remaining < n_length) return SZ_NULL_CHAR; - if (sz_equal(found + prefix_length, n + prefix_length, suffix_length)) return found; - - // Adjust the position. - h = found + 1; - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -/** - * @brief Exact reverse-order substring search helper function, that finds the last occurrence of a suffix of the - * needle using a given search function, and then verifies the remaining part of the needle. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_with_suffix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_suffix, sz_size_t suffix_length) { - - sz_size_t prefix_length = n_length - suffix_length; - while (1) { - sz_cptr_t found = find_suffix(h, h_length, n + prefix_length, suffix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = found - h; - if (remaining < prefix_length) return SZ_NULL_CHAR; - if (sz_equal(found - prefix_length, n, prefix_length)) return found - prefix_length; - - // Adjust the position. - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -SZ_INTERNAL sz_cptr_t _sz_find_over_4bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, (sz_find_t)_sz_find_4byte_serial, 4); -} - -SZ_INTERNAL sz_cptr_t _sz_find_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, - sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, _sz_find_horspool_upto_256bytes_serial, 256); -} - -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, - sz_size_t n_length) { - return _sz_rfind_with_suffix(h, h_length, n, n_length, _sz_rfind_horspool_upto_256bytes_serial, 256); -} - -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - -#if SZ_DETECT_BIG_ENDIAN - sz_find_t backends[] = { - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[(n_length > 1) + (n_length > 256)](h, h_length, n, n_length); -#else - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_2byte_serial, - (sz_find_t)_sz_find_3byte_serial, - (sz_find_t)_sz_find_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - (sz_find_t)_sz_find_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. 
- (n_length > 1) + (n_length > 2) + (n_length > 3) + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 4) + - // For longer needles - use skip tables. - (n_length > 8) + (n_length > 256)](h, h_length, n, n_length); -#endif -} - -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. - (sz_find_t)sz_rfind_byte_serial, - // TODO: implement reverse-order SWAR for 2/3/4 byte variants. - // TODO: (sz_find_t)_sz_rfind_2byte_serial, - // TODO: (sz_find_t)_sz_rfind_3byte_serial, - // TODO: (sz_find_t)_sz_rfind_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - // (sz_find_t)_sz_rfind_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_rfind_horspool_upto_256bytes_serial, - (sz_find_t)_sz_rfind_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. - 0 + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 1) + - // For longer needles - use skip tables. - (n_length > 256)](h, h_length, n, n_length); -} - -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // TODO: Generalize to remove the following asserts! - sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix."); - sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet."); - sz_unused(longer_length && bound); - - // We are going to store 3 diagonals of the matrix. - // The length of the longest (main) diagonal would be `n = (shorter_length + 1)`. - sz_size_t n = shorter_length + 1; - sz_size_t buffer_length = sizeof(sz_size_t) * n * 3; - sz_size_t *distances = (sz_size_t *)alloc->allocate(buffer_length, alloc->handle); - if (!distances) return SZ_SIZE_MAX; - - sz_size_t *previous_distances = distances; - sz_size_t *current_distances = previous_distances + n; - sz_size_t *next_distances = previous_distances + n * 2; - - // Initialize the first two diagonals: - previous_distances[0] = 0; - current_distances[0] = current_distances[1] = 1; - - // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_diagonal_index = 2; - for (; next_diagonal_index != n; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = next_diagonal_index + 1; - for (sz_size_t i = 0; i + 2 < next_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[next_diagonal_index - i - 2] != longer[i]; - sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; - sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; - next_distances[i + 1] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); - } - // Don't forget to populate the first row and the first column of the Levenshtein matrix. 
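/*  Illustrative aside: why three diagonals are enough. In this anti-diagonal ordering
 *  the substitution cost of a cell comes from the diagonal two steps back, while the
 *  deletion and insertion costs come from two adjacent cells on the immediately
 *  preceding diagonal. No older diagonals are ever read, so the three buffers can be
 *  rotated in place instead of storing the whole matrix.
 */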
- next_distances[0] = next_distances[next_diagonal_length - 1] = next_diagonal_index; - // Perform a circular rotation of those buffers, to reuse the memory. - sz_size_t *temporary = previous_distances; - previous_distances = current_distances; - current_distances = next_distances; - next_distances = temporary; - } - - // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a - // larger diagonal. From now onwards, we will be shrinking. Instead of adding value equal to the skewed diagonal - // index on either side, we will be cropping those values out. - sz_size_t diagonals_count = n + n - 1; - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; - for (sz_size_t i = 0; i != next_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[shorter_length - 1 - i] != longer[next_diagonal_index - n + i]; - sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; - sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; - next_distances[i] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); - } - // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift, - // dropping the first element in the current array. - sz_size_t *temporary = previous_distances; - previous_distances = current_distances + 1; - current_distances = next_distances; - next_distances = temporary; - } - - // Cache scalar before `free` call. - sz_size_t result = current_distances[0]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -/** - * @brief Describes the length of a UTF8 character / codepoint / rune in bytes. - */ -typedef enum { - sz_utf8_invalid_k = 0, //!< Invalid UTF8 character. - sz_utf8_rune_1byte_k = 1, //!< 1-byte UTF8 character. - sz_utf8_rune_2bytes_k = 2, //!< 2-byte UTF8 character. - sz_utf8_rune_3bytes_k = 3, //!< 3-byte UTF8 character. - sz_utf8_rune_4bytes_k = 4, //!< 4-byte UTF8 character. -} sz_rune_length_t; - -typedef sz_u32_t sz_rune_t; - -/** - * @brief Extracts just one UTF8 codepoint from a UTF8 string into a 32-bit unsigned integer. - */ -SZ_INTERNAL void _sz_extract_utf8_rune(sz_cptr_t utf8, sz_rune_t *code, sz_rune_length_t *code_length) { - sz_u8_t const *current = (sz_u8_t const *)utf8; - sz_u8_t leading_byte = *current++; - sz_rune_t ch; - sz_rune_length_t ch_length; - - // TODO: This can be made entirely branchless using 32-bit SWAR. - if (leading_byte < 0x80) { - // Single-byte rune (0xxxxxxx) - ch = leading_byte; - ch_length = sz_utf8_rune_1byte_k; - } - else if ((leading_byte & 0xE0) == 0xC0) { - // Two-byte rune (110xxxxx 10xxxxxx) - ch = (leading_byte & 0x1F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_2bytes_k; - } - else if ((leading_byte & 0xF0) == 0xE0) { - // Three-byte rune (1110xxxx 10xxxxxx 10xxxxxx) - ch = (leading_byte & 0x0F) << 12; - ch |= (*current++ & 0x3F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_3bytes_k; - } - else if ((leading_byte & 0xF8) == 0xF0) { - // Four-byte rune (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) - ch = (leading_byte & 0x07) << 18; - ch |= (*current++ & 0x3F) << 12; - ch |= (*current++ & 0x3F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_4bytes_k; - } - else { - // Invalid UTF8 rune. 
- ch = 0; - ch_length = sz_utf8_invalid_k; - } - *code = ch; - *code_length = ch_length; -} - -/** - * @brief Exports a UTF8 string into a UTF32 buffer. - * ! The result is undefined id the UTF8 string is corrupted. - * @return The length in the number of codepoints. - */ -SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_length, sz_rune_t *utf32) { - sz_cptr_t const end = utf8 + utf8_length; - sz_size_t count = 0; - sz_rune_length_t rune_length; - for (; utf8 != end; utf8 += rune_length, utf32++, count++) _sz_extract_utf8_rune(utf8, utf32, &rune_length); - return count; -} - -/** - * @brief Compute the Levenshtein distance between two strings using the Wagner-Fisher algorithm. - * Stores only 2 rows of the Levenshtein matrix, but uses 64-bit integers for the distance values, - * and upcasts UTF8 variable-length codepoints to 64-bit integers for faster addressing. - * - * ! In the worst case for 2 strings of length 100, that contain just one 16-bit codepoint this will result in extra: - * + 2 rows * 100 slots * 8 bytes/slot = 1600 bytes of memory for the two rows of the Levenshtein matrix rows. - * + 100 codepoints * 2 strings * 4 bytes/codepoint = 800 bytes of memory for the UTF8 buffer. - * = 2400 bytes of memory or @b 12x memory amplification! - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_bool_t can_be_unicode, sz_memory_allocator_t *alloc) { - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // A good idea may be to dispatch different kernels for different string lengths. - // Like using `uint8_t` counters for strings under 255 characters long. - // Good in theory, this results in frequent upcasts and downcasts in serial code. - // On strings over 20 bytes, using `uint8` over `uint64` on 64-bit x86 CPU doubles the execution time. - // So one must be very cautious with such optimizations. - typedef sz_size_t _distance_t; - - // Compute the number of columns in our Levenshtein matrix. - sz_size_t const n = shorter_length + 1; - - // If a buffering memory-allocator is provided, this operation is practically free, - // and cheaper than allocating even 512 bytes (for small distance matrices) on stack. - sz_size_t buffer_length = sizeof(_distance_t) * (n * 2); - - // If the strings contain Unicode characters, let's estimate the max character width, - // and use it to allocate a larger buffer to decode UTF8. - if ((can_be_unicode == sz_true_k) && - (sz_isascii(longer, longer_length) == sz_false_k || sz_isascii(shorter, shorter_length) == sz_false_k)) { - buffer_length += (shorter_length + longer_length) * sizeof(sz_rune_t); - } - else { can_be_unicode = sz_false_k; } - - // If the allocation fails, return the maximum distance. - sz_ptr_t const buffer = (sz_ptr_t)alloc->allocate(buffer_length, alloc->handle); - if (!buffer) return SZ_SIZE_MAX; - - // Let's export the UTF8 sequence into the newly allocated buffer at the end. - if (can_be_unicode == sz_true_k) { - sz_rune_t *const longer_utf32 = (sz_rune_t *)(buffer + sizeof(_distance_t) * (n * 2)); - sz_rune_t *const shorter_utf32 = longer_utf32 + longer_length; - // Export the UTF8 sequences into the newly allocated buffer. 
- longer_length = _sz_export_utf8_to_utf32(longer, longer_length, longer_utf32); - shorter_length = _sz_export_utf8_to_utf32(shorter, shorter_length, shorter_utf32); - longer = (sz_cptr_t)longer_utf32; - shorter = (sz_cptr_t)shorter_utf32; - } - - // Let's parameterize the core logic for different character types and distance types. -#define _wagner_fisher_unbounded(_distance_t, _char_t) \ - /* Now let's cast our pointer to avoid it in subsequent sections. */ \ - _char_t const *const longer_chars = (_char_t const *)longer; \ - _char_t const *const shorter_chars = (_char_t const *)shorter; \ - _distance_t *previous_distances = (_distance_t *)buffer; \ - _distance_t *current_distances = previous_distances + n; \ - /* Initialize the first row of the Levenshtein matrix with `iota`-style arithmetic progression. */ \ - for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ - /* The main loop of the algorithm with quadratic complexity. */ \ - for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ - _char_t const longer_char = longer_chars[idx_longer]; \ - /* Using pure pointer arithmetic is faster than iterating with an index. */ \ - _char_t const *shorter_ptr = shorter_chars; \ - _distance_t const *previous_ptr = previous_distances; \ - _distance_t *current_ptr = current_distances; \ - _distance_t *const current_end = current_ptr + shorter_length; \ - current_ptr[0] = idx_longer + 1; \ - for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ - _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ - /* We can avoid `+1` for costs here, shifting it to post-minimum computation, */ \ - /* saving one increment operation. */ \ - _distance_t cost_deletion = previous_ptr[1]; \ - _distance_t cost_insertion = current_ptr[0]; \ - /* ? It might be a good idea to enforce branchless execution here. */ \ - /* ? The caveat being that the benchmarks on longer sequences backfire and more research is needed. */ \ - current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ - } \ - /* Swap `previous_distances` and `current_distances` pointers. */ \ - _distance_t *temporary = previous_distances; \ - previous_distances = current_distances; \ - current_distances = temporary; \ - } \ - /* Cache scalar before `free` call. */ \ - sz_size_t result = previous_distances[shorter_length]; \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return result; - - // Let's define a separate variant for bounded distance computation. - // Practically the same as unbounded, but also collecting the running minimum within each row for early exit. 
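/*  Illustrative aside: before that bounded variant, a plain-function sketch of the same
 *  two-row recurrence without the macro plumbing, the custom allocator, or the UTF8
 *  handling. Names are hypothetical; `malloc` is used only for brevity.
 */
#include <stddef.h> /* size_t       - needed only for this sketch */
#include <stdint.h> /* SIZE_MAX     - needed only for this sketch */
#include <stdlib.h> /* malloc, free - needed only for this sketch */

static size_t levenshtein_two_rows(char const *a, size_t a_length, char const *b, size_t b_length) {
    size_t *previous = (size_t *)malloc((b_length + 1) * sizeof(size_t));
    size_t *current = (size_t *)malloc((b_length + 1) * sizeof(size_t));
    if (!previous || !current) { free(previous), free(current); return SIZE_MAX; }
    for (size_t j = 0; j <= b_length; ++j) previous[j] = j;
    for (size_t i = 0; i != a_length; ++i) {
        current[0] = i + 1;
        for (size_t j = 0; j != b_length; ++j) {
            size_t if_substitution = previous[j] + (a[i] != b[j]);
            size_t if_deletion = previous[j + 1] + 1;
            size_t if_insertion = current[j] + 1;
            size_t best = if_substitution < if_deletion ? if_substitution : if_deletion;
            current[j + 1] = best < if_insertion ? best : if_insertion;
        }
        size_t *temporary = previous;
        previous = current, current = temporary;
    }
    size_t result = previous[b_length];
    free(previous), free(current);
    return result;
}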
-#define _wagner_fisher_bounded(_distance_t, _char_t) \ - _char_t const *const longer_chars = (_char_t const *)longer; \ - _char_t const *const shorter_chars = (_char_t const *)shorter; \ - _distance_t *previous_distances = (_distance_t *)buffer; \ - _distance_t *current_distances = previous_distances + n; \ - for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ - for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ - _char_t const longer_char = longer_chars[idx_longer]; \ - _char_t const *shorter_ptr = shorter_chars; \ - _distance_t const *previous_ptr = previous_distances; \ - _distance_t *current_ptr = current_distances; \ - _distance_t *const current_end = current_ptr + shorter_length; \ - current_ptr[0] = idx_longer + 1; \ - /* Initialize min_distance with a value greater than bound */ \ - _distance_t min_distance = bound - 1; \ - for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ - _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ - _distance_t cost_deletion = previous_ptr[1]; \ - _distance_t cost_insertion = current_ptr[0]; \ - current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ - /* Keep track of the minimum distance seen so far in this row */ \ - min_distance = sz_min_of_two(current_ptr[1], min_distance); \ - } \ - /* If the minimum distance in this row exceeded the bound, return early */ \ - if (min_distance >= bound) { \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return bound; \ - } \ - _distance_t *temporary = previous_distances; \ - previous_distances = current_distances; \ - current_distances = temporary; \ - } \ - sz_size_t result = previous_distances[shorter_length]; \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return sz_min_of_two(result, bound); - - // Dispatch the actual computation. - if (!bound) { - if (can_be_unicode == sz_true_k) { _wagner_fisher_unbounded(sz_size_t, sz_rune_t); } - else { _wagner_fisher_unbounded(sz_size_t, sz_u8_t); } - } - else { - if (can_be_unicode == sz_true_k) { _wagner_fisher_bounded(sz_size_t, sz_rune_t); } - else { _wagner_fisher_bounded(sz_size_t, sz_u8_t); } - } -} - -SZ_PUBLIC sz_size_t sz_edit_distance_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Skip the matching prefixes and suffixes, they won't affect the distance. - for (sz_cptr_t a_end = longer + longer_length, b_end = shorter + shorter_length; - longer != a_end && shorter != b_end && *longer == *shorter; - ++longer, ++shorter, --longer_length, --shorter_length); - for (; longer_length && shorter_length && longer[longer_length - 1] == shorter[shorter_length - 1]; - --longer_length, --shorter_length); - - // Bounded computations may exit early. - int const is_bounded = bound < longer_length; - if (is_bounded) { - // If one of the strings is empty - the edit distance is equal to the length of the other one. 
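/*  Illustrative aside: a minimal usage sketch, assuming an unbounded search (bound == 0)
 *  and the default libc-backed allocator (passing SZ_NULL for the allocator):
 *
 *      sz_size_t distance = sz_edit_distance_serial("kitten", 6, "sitting", 7, 0, SZ_NULL);
 *      // distance == 3 for this classic pair
 */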
- if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); - // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; - } - - if (shorter_length == 0) return longer_length; // If no mismatches were found - the distance is zero. - if (shorter_length == longer_length && !is_bounded) - return _sz_edit_distance_skewed_diagonals_serial(longer, longer_length, shorter, shorter_length, bound, alloc); - return _sz_edit_distance_wagner_fisher_serial(longer, longer_length, shorter, shorter_length, bound, sz_false_k, - alloc); -} - -SZ_PUBLIC sz_ssize_t sz_alignment_score_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc) { - - // If one of the strings is empty - the edit distance is equal to the length of the other one - if (longer_length == 0) return (sz_ssize_t)shorter_length * gap; - if (shorter_length == 0) return (sz_ssize_t)longer_length * gap; - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - sz_size_t n = shorter_length + 1; - sz_size_t buffer_length = sizeof(sz_ssize_t) * n * 2; - sz_ssize_t *distances = (sz_ssize_t *)alloc->allocate(buffer_length, alloc->handle); - sz_ssize_t *previous_distances = distances; - sz_ssize_t *current_distances = previous_distances + n; - - for (sz_size_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) - previous_distances[idx_shorter] = (sz_ssize_t)idx_shorter * gap; - - sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter; - sz_u8_t const *longer_unsigned = (sz_u8_t const *)longer; - for (sz_size_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { - current_distances[0] = ((sz_ssize_t)idx_longer + 1) * gap; - - // Initialize min_distance with a value greater than bound - sz_error_cost_t const *a_subs = subs + longer_unsigned[idx_longer] * 256ul; - for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) { - sz_ssize_t cost_deletion = previous_distances[idx_shorter + 1] + gap; - sz_ssize_t cost_insertion = current_distances[idx_shorter] + gap; - sz_ssize_t cost_substitution = previous_distances[idx_shorter] + a_subs[shorter_unsigned[idx_shorter]]; - current_distances[idx_shorter + 1] = sz_max_of_three(cost_deletion, cost_insertion, cost_substitution); - } - - // Swap previous_distances and current_distances pointers - sz_pointer_swap((void **)&previous_distances, (void **)¤t_distances); - } - - // Cache scalar before `free` call. 
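/*  Illustrative aside: the `subs` argument is read as a row-major 256 x 256 matrix of
 *  costs, `subs[row_byte * 256 + column_byte]`. A hypothetical helper that fills such a
 *  matrix with 0 for matches and -1 otherwise; together with `gap = -1` the maximized
 *  score then coincides with the negated Levenshtein distance.
 */
static void fill_unit_substitution_costs(signed char *costs /* 256 * 256 entries, assuming the cost type is a signed byte */) {
    for (int row = 0; row != 256; ++row)
        for (int column = 0; column != 256; ++column)
            costs[row * 256 + column] = (signed char)(row == column ? 0 : -1);
}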
- sz_ssize_t result = previous_distances[shorter_length]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - - sz_size_t const min_length = sz_min_of_two(a_length, b_length); - sz_size_t const max_length = sz_max_of_two(a_length, b_length); - sz_cptr_t const a_end = a + min_length; - bound = bound == 0 ? max_length : bound; - - // Walk through both strings using SWAR and counting the number of differing characters. - sz_size_t distance = max_length - min_length; -#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN - if (min_length >= SZ_SWAR_THRESHOLD) { - sz_u64_vec_t a_vec, b_vec, match_vec; - for (; a + 8 <= a_end && distance < bound; a += 8, b += 8) { - a_vec.u64 = sz_u64_load(a).u64; - b_vec.u64 = sz_u64_load(b).u64; - match_vec = _sz_u64_each_byte_equal(a_vec, b_vec); - distance += sz_u64_popcount((~match_vec.u64) & 0x8080808080808080ull); - } - } -#endif - - for (; a != a_end && distance < bound; ++a, ++b) { distance += (*a != *b); } - return sz_min_of_two(distance, bound); -} - -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - - sz_cptr_t const a_end = a + a_length; - sz_cptr_t const b_end = b + b_length; - sz_size_t distance = 0; - - sz_rune_t a_rune, b_rune; - sz_rune_length_t a_rune_length, b_rune_length; - - if (bound) { - for (; a < a_end && b < b_end && distance < bound; a += a_rune_length, b += b_rune_length) { - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - distance += (a_rune != b_rune); - } - // If one string has more runes, we need to go through the tail. - if (distance < bound) { - for (; a < a_end && distance < bound; a += a_rune_length, ++distance) - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - - for (; b < b_end && distance < bound; b += b_rune_length, ++distance) - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - } - } - else { - for (; a < a_end && b < b_end; a += a_rune_length, b += b_rune_length) { - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - distance += (a_rune != b_rune); - } - // If one string has more runes, we need to go through the tail. - for (; a < a_end; a += a_rune_length, ++distance) _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - for (; b < b_end; b += b_rune_length, ++distance) _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - } - return distance; -} - -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length) { - sz_u64_t checksum = 0; - sz_u8_t const *text_u8 = (sz_u8_t const *)text; - sz_u8_t const *text_end = text_u8 + length; - for (; text_u8 != text_end; ++text_u8) checksum += *text_u8; - return checksum; -} - -/** - * @brief Largest prime number that fits into 31 bits. - * @see https://mersenneforum.org/showthread.php?t=3471 - */ -#define SZ_U32_MAX_PRIME (2147483647u) - -/** - * @brief Largest prime number that fits into 64 bits. - * @see https://mersenneforum.org/showthread.php?t=3471 - * - * 2^64 = 18,446,744,073,709,551,616 - * this = 18,446,744,073,709,551,557 - * diff = 59 - */ -#define SZ_U64_MAX_PRIME (18446744073709551557ull) - -/* - * One hardware-accelerated way of mixing hashes can be CRC, but it's only implemented for 32-bit values. 
- * Using a Boost-like mixer works very poorly in such case: - * - * hash_first ^ (hash_second + 0x517cc1b727220a95 + (hash_first << 6) + (hash_first >> 2)); - * - * Let's stick to the Fibonacci hash trick using the golden ratio. - * https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ - */ -#define _sz_hash_mix(first, second) ((first * 11400714819323198485ull) ^ (second * 11400714819323198485ull)) -#define _sz_shift_low(x) (x) -#define _sz_shift_high(x) ((x + 77ull) & 0xFFull) -#define _sz_prime_mod(x) (x % SZ_U64_MAX_PRIME) - -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length) { - - sz_u64_t hash_low = 0; - sz_u64_t hash_high = 0; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - switch (length) { - case 0: return 0; - - // Texts under 7 bytes long are definitely below the largest prime. - case 1: - hash_low = _sz_shift_low(text[0]); - hash_high = _sz_shift_high(text[0]); - break; - case 2: - hash_low = _sz_shift_low(text[0]) * 31ull + _sz_shift_low(text[1]); - hash_high = _sz_shift_high(text[0]) * 257ull + _sz_shift_high(text[1]); - break; - case 3: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull + // - _sz_shift_low(text[2]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull + // - _sz_shift_high(text[2]); - break; - case 4: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull + // - _sz_shift_low(text[3]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull + // - _sz_shift_high(text[3]); - break; - case 5: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull + // - _sz_shift_low(text[4]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull + // - _sz_shift_high(text[4]); - break; - case 6: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull + // - _sz_shift_low(text[5]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull + // - _sz_shift_high(text[5]); - break; - case 7: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull * 31ull + // - _sz_shift_low(text[5]) * 31ull + // - _sz_shift_low(text[6]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull 
* 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull * 257ull + // - _sz_shift_high(text[5]) * 257ull + // - _sz_shift_high(text[6]); - break; - default: - // Unroll the first seven cycles: - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - hash_low = hash_low * 31ull + _sz_shift_low(text[1]); - hash_high = hash_high * 257ull + _sz_shift_high(text[1]); - hash_low = hash_low * 31ull + _sz_shift_low(text[2]); - hash_high = hash_high * 257ull + _sz_shift_high(text[2]); - hash_low = hash_low * 31ull + _sz_shift_low(text[3]); - hash_high = hash_high * 257ull + _sz_shift_high(text[3]); - hash_low = hash_low * 31ull + _sz_shift_low(text[4]); - hash_high = hash_high * 257ull + _sz_shift_high(text[4]); - hash_low = hash_low * 31ull + _sz_shift_low(text[5]); - hash_high = hash_high * 257ull + _sz_shift_high(text[5]); - hash_low = hash_low * 31ull + _sz_shift_low(text[6]); - hash_high = hash_high * 257ull + _sz_shift_high(text[6]); - text += 7; - - // Iterate throw the rest with the modulus: - for (; text != text_end; ++text) { - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - } - break; - } - - return _sz_hash_mix(hash_low, hash_high); -} - -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Compute the initial hash value for the first window. - sz_u64_t hash_low = 0, hash_high = 0, hash_mix; - for (sz_u8_t const *first_end = text + window_length; text < first_end; ++text) - hash_low = (hash_low * 31ull + _sz_shift_low(*text)) % SZ_U64_MAX_PRIME, - hash_high = (hash_high * 257ull + _sz_shift_high(*text)) % SZ_U64_MAX_PRIME; - - // In most cases the fingerprint length will be a power of two. - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - - // Compute the hash value for every window, exporting into the fingerprint, - // using the expensive modulo operation. - sz_size_t cycles = 1; - sz_size_t const step_mask = step - 1; - for (; text < text_end; ++text, ++cycles) { - // Discard one character: - hash_low -= _sz_shift_low(*(text - window_length)) * prime_power_low; - hash_high -= _sz_shift_high(*(text - window_length)) * prime_power_high; - // And add a new one: - hash_low = 31ull * hash_low + _sz_shift_low(*text); - hash_high = 257ull * hash_high + _sz_shift_high(*text); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - // Mix only if we've skipped enough hashes. 
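// Note on the masking below: `step_mask = step - 1` turns the modulo check into a single AND,
// which behaves as "every `step`-th window" only when `step` is a power of two. An illustrative
// trace, assuming `step == 4` (an example value, not a requirement of the API):
//
//     cycles:           1  2  3  4  5  6  7  8 ...
//     cycles & (4 - 1): 1  2  3  0  1  2  3  0 ...
//
// so the callback fires on every fourth window. For a non-power-of-two `step`, the same AND fires
// at a different, typically denser, cadence.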
- if ((cycles & step_mask) == 0) { - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - } - } -} - -#undef _sz_shift_low -#undef _sz_shift_high -#undef _sz_hash_mix -#undef _sz_prime_mod - -/** - * @brief Uses a small lookup-table to convert a lowercase character to uppercase. - */ -SZ_INTERNAL sz_u8_t sz_u8_tolower(sz_u8_t c) { - static sz_u8_t const lowered[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return lowered[c]; -} - -/** - * @brief Uses a small lookup-table to convert an uppercase character to lowercase. - */ -SZ_INTERNAL sz_u8_t sz_u8_toupper(sz_u8_t c) { - static sz_u8_t const upped[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return upped[c]; -} - -/** - * @brief Uses two small lookup tables (768 bytes total) to accelerate division by a small - * unsigned integer. Performs two lookups, one multiplication, two shifts, and two accumulations. - * - * @param divisor Integral value @b larger than one. - * @param number Integral value to divide. 
- */ -SZ_INTERNAL sz_u8_t sz_u8_divide(sz_u8_t number, sz_u8_t divisor) { - sz_assert(divisor > 1); - static sz_u16_t const multipliers[256] = { - 0, 0, 0, 21846, 0, 39322, 21846, 9363, 0, 50973, 39322, 29790, 21846, 15124, 9363, 4370, - 0, 57826, 50973, 44841, 39322, 34329, 29790, 25645, 21846, 18351, 15124, 12137, 9363, 6780, 4370, 2115, - 0, 61565, 57826, 54302, 50973, 47824, 44841, 42011, 39322, 36765, 34329, 32006, 29790, 27671, 25645, 23705, - 21846, 20063, 18351, 16706, 15124, 13602, 12137, 10725, 9363, 8049, 6780, 5554, 4370, 3224, 2115, 1041, - 0, 63520, 61565, 59668, 57826, 56039, 54302, 52614, 50973, 49377, 47824, 46313, 44841, 43407, 42011, 40649, - 39322, 38028, 36765, 35532, 34329, 33154, 32006, 30885, 29790, 28719, 27671, 26647, 25645, 24665, 23705, 22766, - 21846, 20945, 20063, 19198, 18351, 17520, 16706, 15907, 15124, 14356, 13602, 12863, 12137, 11424, 10725, 10038, - 9363, 8700, 8049, 7409, 6780, 6162, 5554, 4957, 4370, 3792, 3224, 2665, 2115, 1573, 1041, 517, - 0, 64520, 63520, 62535, 61565, 60609, 59668, 58740, 57826, 56926, 56039, 55164, 54302, 53452, 52614, 51788, - 50973, 50169, 49377, 48595, 47824, 47063, 46313, 45572, 44841, 44120, 43407, 42705, 42011, 41326, 40649, 39982, - 39322, 38671, 38028, 37392, 36765, 36145, 35532, 34927, 34329, 33738, 33154, 32577, 32006, 31443, 30885, 30334, - 29790, 29251, 28719, 28192, 27671, 27156, 26647, 26143, 25645, 25152, 24665, 24182, 23705, 23233, 22766, 22303, - 21846, 21393, 20945, 20502, 20063, 19628, 19198, 18772, 18351, 17933, 17520, 17111, 16706, 16305, 15907, 15514, - 15124, 14738, 14356, 13977, 13602, 13231, 12863, 12498, 12137, 11779, 11424, 11073, 10725, 10380, 10038, 9699, - 9363, 9030, 8700, 8373, 8049, 7727, 7409, 7093, 6780, 6470, 6162, 5857, 5554, 5254, 4957, 4662, - 4370, 4080, 3792, 3507, 3224, 2943, 2665, 2388, 2115, 1843, 1573, 1306, 1041, 778, 517, 258, - }; - // This table can be avoided using a single addition and counting trailing zeros. 
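// For reference, the `shifts` table below is a precomputed `floor(log2(divisor - 1))`, i.e. the
// shift for a divisor `d` in [2, 256] equals `ceil(log2(d)) - 1`. One branch-free way to compute it
// on the fly (a sketch, not necessarily the exact trick meant above) is a leading-zero count:
//
//     sz_u8_t shift_for_divisor(sz_u8_t divisor) { // expects divisor > 1
//         return (sz_u8_t)(63 - sz_u64_clz((sz_u64_t)divisor - 1));
//     }
//
// where `sz_u64_clz` stands for any helper counting the leading zeros of a non-zero 64-bit value.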
- static sz_u8_t const shifts[256] = { - 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // - 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // - 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // - 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - }; - sz_u32_t multiplier = multipliers[divisor]; - sz_u8_t shift = shifts[divisor]; - - sz_u16_t q = (sz_u16_t)((multiplier * number) >> 16); - sz_u16_t t = ((number - q) >> 1) + q; - return (sz_u8_t)(t >> shift); -} - -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result) { - sz_u8_t const *unsigned_lut = (sz_u8_t const *)lut; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = unsigned_lut[*unsigned_text]; -} - -SZ_PUBLIC void sz_tolower_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_tolower(*unsigned_text); -} - -SZ_PUBLIC void sz_toupper_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_toupper(*unsigned_text); -} - -SZ_PUBLIC void sz_toascii_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = *unsigned_text & 0x7F; -} - -/** - * @brief Check if there is a byte in this buffer, that exceeds 127 and can't be an ASCII character. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - */ -SZ_PUBLIC sz_bool_t sz_isascii_serial(sz_cptr_t text, sz_size_t length) { - - if (!length) return sz_true_k; - sz_u8_t const *h = (sz_u8_t const *)text; - sz_u8_t const *const h_end = h + length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h & 0x80ull) return sz_false_k; -#endif - - // Validate eight bytes at once using SWAR. - sz_u64_vec_t text_vec; - for (; h + 8 <= h_end; h += 8) { - text_vec.u64 = *(sz_u64_t const *)h; - if (text_vec.u64 & 0x8080808080808080ull) return sz_false_k; - } - - // Handle the misaligned tail. 
- for (; h < h_end; ++h) - if (*h & 0x80ull) return sz_false_k; - return sz_true_k; -} - -SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { - - sz_assert(alphabet_size > 0 && alphabet_size <= 256 && "Inadequate alphabet size"); - - if (alphabet_size == 1) sz_fill(result, result_length, *alphabet); - - else { - sz_assert(generator && "Expects a valid random generator"); - sz_u8_t divisor = (sz_u8_t)alphabet_size; - for (sz_cptr_t end = result + result_length; result != end; ++result) { - sz_u8_t random = generator(generator_user_data) & 0xFF; - sz_u8_t quotient = sz_u8_divide(random, divisor); - *result = alphabet[random - quotient * divisor]; - } - } -} - -#pragma endregion - -/* - * Serial implementation of string class operations. - */ -#pragma region Serial Implementation for the String Class - -SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string) { - // It doesn't matter if it's on stack or heap, the pointer location is the same. - return (sz_bool_t)((sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]); -} - -SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length) { - sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]; - sz_size_t is_big_mask = is_small - 1ull; - *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same. - // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. - *length = string->external.length & (0x00000000000000FFull | is_big_mask); -} - -SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, - sz_bool_t *is_external) { - sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]; - sz_size_t is_big_mask = is_small - 1ull; - *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same. - // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. - *length = string->external.length & (0x00000000000000FFull | is_big_mask); - // In case the string is small, the `is_small - 1ull` will become 0xFFFFFFFFFFFFFFFFull. - *space = sz_u64_blend(SZ_STRING_INTERNAL_SPACE, string->external.space, is_big_mask); - *is_external = (sz_bool_t)!is_small; -} - -SZ_PUBLIC sz_bool_t sz_string_equal(sz_string_t const *a, sz_string_t const *b) { - // Tempting to say that the external.length is bitwise the same even if it includes - // some bytes of the on-stack payload, but we don't at this writing maintain that invariant. - // (An on-stack string includes noise bytes in the high-order bits of external.length. So do this - // the hard/correct way. - -#if SZ_USE_MISALIGNED_LOADS - // Dealing with StringZilla strings, we know that the `start` pointer always points - // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. - -#endif - // Alternatively, fall back to byte-by-byte comparison. 
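// The `sz_string_range` calls below rely on the branch-less small-string unpacking defined above.
// A worked example of that masking, assuming the little-endian layout where `internal.length`
// aliases the lowest byte of `external.length`:
//
//     on-stack string: is_small = 1, is_big_mask = 0x0000000000000000,
//                      so `external.length & (0xFF | 0)` keeps only the low byte, the on-stack length;
//     heap string:     is_small = 0, is_big_mask = 0xFFFFFFFFFFFFFFFF,
//                      so the mask is all ones and the full 64-bit length survives.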
- sz_ptr_t a_start, b_start; - sz_size_t a_length, b_length; - sz_string_range(a, &a_start, &a_length); - sz_string_range(b, &b_start, &b_length); - return (sz_bool_t)(a_length == b_length && sz_equal(a_start, b_start, b_length)); -} - -SZ_PUBLIC sz_ordering_t sz_string_order(sz_string_t const *a, sz_string_t const *b) { -#if SZ_USE_MISALIGNED_LOADS - // Dealing with StringZilla strings, we know that the `start` pointer always points - // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. - -#endif - // Alternatively, fall back to byte-by-byte comparison. - sz_ptr_t a_start, b_start; - sz_size_t a_length, b_length; - sz_string_range(a, &a_start, &a_length); - sz_string_range(b, &b_start, &b_length); - return sz_order(a_start, a_length, b_start, b_length); -} - -SZ_PUBLIC void sz_string_init(sz_string_t *string) { - sz_assert(string && "String can't be SZ_NULL."); - - // Only 8 + 1 + 1 need to be initialized. - string->internal.start = &string->internal.chars[0]; - // But for safety let's initialize the entire structure to zeros. - // string->internal.chars[0] = 0; - // string->internal.length = 0; - string->words[1] = 0; - string->words[2] = 0; - string->words[3] = 0; -} - -SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator) { - sz_size_t space_needed = length + 1; // space for trailing \0 - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); - // Initialize the string to zeros for safety. - string->words[1] = 0; - string->words[2] = 0; - string->words[3] = 0; - // If we are lucky, no memory allocations will be needed. - if (space_needed <= SZ_STRING_INTERNAL_SPACE) { - string->internal.start = &string->internal.chars[0]; - string->internal.length = (sz_u8_t)length; - } - else { - // If we are not lucky, we need to allocate memory. - string->external.start = (sz_ptr_t)allocator->allocate(space_needed, allocator->handle); - if (!string->external.start) return SZ_NULL_CHAR; - string->external.length = length; - string->external.space = space_needed; - } - sz_assert(&string->internal.start == &string->external.start && "Alignment confusion"); - string->external.start[length] = 0; - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - - sz_size_t new_space = new_capacity + 1; - if (new_space <= SZ_STRING_INTERNAL_SPACE) return string->external.start; - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - sz_assert(new_space > string_space && "New space must be larger than current."); - - sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); - if (!new_start) return SZ_NULL_CHAR; - - sz_copy(new_start, string_start, string_length); - string->external.start = new_start; - string->external.space = new_space; - string->external.padding = 0; - string->external.length = string_length; - - // Deallocate the old string. 
- if (string_is_external) allocator->free(string_start, string_space, allocator->handle); - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // We may already be space-optimal, and in that case we don't need to do anything. - sz_size_t new_space = string_length + 1; - if (string_space == new_space || !string_is_external) return string->external.start; - - sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); - if (!new_start) return SZ_NULL_CHAR; - - sz_copy(new_start, string_start, string_length); - string->external.start = new_start; - string->external.space = new_space; - string->external.padding = 0; - string->external.length = string_length; - - // Deallocate the old string. - if (string_is_external) allocator->free(string_start, string_space, allocator->handle); - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length, - sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // The user intended to extend the string. - offset = sz_min_of_two(offset, string_length); - - // If we are lucky, no memory allocations will be needed. - if (string_length + added_length < string_space) { - sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); - string_start[string_length + added_length] = 0; - // Even if the string is on the stack, the `+=` won't affect the tail of the string. - string->external.length += added_length; - } - // If we are not lucky, we need to allocate more memory. - else { - sz_size_t next_planned_size = sz_max_of_two(SZ_CACHE_LINE_WIDTH, string_space * 2ull); - sz_size_t min_needed_space = sz_size_bit_ceil(offset + string_length + added_length + 1); - sz_size_t new_space = sz_max_of_two(min_needed_space, next_planned_size); - string_start = sz_string_reserve(string, new_space - 1, allocator); - if (!string_start) return SZ_NULL_CHAR; - - // Copy into the new buffer. - sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); - string_start[string_length + added_length] = 0; - string->external.length = string_length + added_length; - } - - return string_start; -} - -SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length) { - - sz_assert(string && "String can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // Normalize the offset, it can't be larger than the length. - offset = sz_min_of_two(offset, string_length); - - // We shouldn't normalize the length, to avoid overflowing on `offset + length >= string_length`, - // if receiving `length == SZ_SIZE_MAX`. 
After the following expression, the `length` will contain - // exactly the delta between the original and final length of this `string`. - length = sz_min_of_two(length, string_length - offset); - - // There are 2 common cases that wouldn't even require a `memmove`: - // 1. Erasing the entire contents of the string. - // In that case the `length` argument will be equal to or greater than the `length` member. - // 2. Removing the tail of the string with something like `string.pop_back()` in C++. - // - // In both of those, regardless of the location of the string - stack or heap, - // the erasing is as easy as setting the length to the offset. - // In every other case, we must `memmove` the tail of the string to the left. - if (offset + length < string_length) - sz_move(string_start + offset, string_start + offset + length, string_length - offset - length); - - // The `string->external.length = offset` assignment would discard the last characters - // of the on-the-stack string, but an in-place subtraction works. - string->external.length -= length; - string_start[string_length - length] = 0; - return length; -} - -SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator) { - if (!sz_string_is_on_stack(string)) - allocator->free(string->external.start, string->external.space, allocator->handle); - sz_string_init(string); -} - -// When overriding libc, disable optimisations for this function because MSVC will optimize the loops into a memset, -// which then causes a stack overflow due to infinite recursion (memset -> sz_fill_serial -> memset). -#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC -#pragma optimize("", off) -#endif -SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - sz_ptr_t end = target + length; - // Dealing with short strings, a single sequential pass would be faster. - // If the size is larger than 2 words, then at least 1 of them will be aligned. - // But just one aligned word may not be worth SWAR. - if (length < SZ_SWAR_THRESHOLD) - while (target != end) *(target++) = value; - - // In case of long strings, skip unaligned bytes, and then fill the rest in 64-bit chunks. - else { - sz_u64_t value64 = (sz_u64_t)value * 0x0101010101010101ull; - while ((sz_size_t)target & 7ull) *(target++) = value; - while (target + 8 <= end) *(sz_u64_t *)target = value64, target += 8; - while (target != end) *(target++) = value; - } -} -#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC -#pragma optimize("", on) -#endif - -SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_MISALIGNED_LOADS - while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)source, target += 8, source += 8, length -= 8; -#endif - while (length--) *(target++) = *(source++); -} - -SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // Implementing `memmove` is trickier than `memcpy`, as the ranges may overlap. - // Existing implementations often have two passes, in normal and reversed order, - // depending on the relation of `target` and `source` addresses. - // https://student.cs.uwaterloo.ca/~cs350/common/os161-src-html/doxygen/html/memmove_8c_source.html - // https://marmota.medium.com/c-language-making-memmove-def8792bb8d5 - // - // We can use the `memcpy`-like left-to-right pass if we know that the `target` is before `source`. - // Or if we know that they don't intersect!
In that case the traversal order is irrelevant, - // but older CPUs may predict and fetch forward-passes better. - if (target < source || target >= source + length) { -#if SZ_USE_MISALIGNED_LOADS - while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)(source), target += 8, source += 8, length -= 8; -#endif - while (length--) *(target++) = *(source++); - } - else { - // Jump to the end and walk backwards. - target += length, source += length; -#if SZ_USE_MISALIGNED_LOADS - while (length >= 8) *(sz_u64_t *)(target -= 8) = *(sz_u64_t const *)(source -= 8), length -= 8; -#endif - while (length--) *(--target) = *(--source); - } -} - -#pragma endregion - -/* - * @brief Serial implementation for strings sequence processing. - */ -#pragma region Serial Implementation for Sequences - -SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) { - - sz_size_t matches = 0; - while (matches != sequence->count && predicate(sequence, sequence->order[matches])) ++matches; - - for (sz_size_t i = matches + 1; i < sequence->count; ++i) - if (predicate(sequence, sequence->order[i])) - sz_u64_swap(sequence->order + i, sequence->order + matches), ++matches; - - return matches; -} - -SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) { - - sz_size_t start_b = partition + 1; - - // If the direct merge is already sorted - if (!less(sequence, sequence->order[start_b], sequence->order[partition])) return; - - sz_size_t start_a = 0; - while (start_a <= partition && start_b <= sequence->count) { - - // If element 1 is in right place - if (!less(sequence, sequence->order[start_b], sequence->order[start_a])) { start_a++; } - else { - sz_size_t value = sequence->order[start_b]; - sz_size_t index = start_b; - - // Shift all the elements between element 1 - // element 2, right by 1. 
- while (index != start_a) { sequence->order[index] = sequence->order[index - 1], index--; } - sequence->order[start_a] = value; - - // Update all the pointers - start_a++; - partition++; - start_b++; - } - } -} - -SZ_PUBLIC void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - sz_u64_t *keys = sequence->order; - sz_size_t keys_count = sequence->count; - for (sz_size_t i = 1; i < keys_count; i++) { - sz_u64_t i_key = keys[i]; - sz_size_t j = i; - for (; j > 0 && less(sequence, i_key, keys[j - 1]); --j) keys[j] = keys[j - 1]; - keys[j] = i_key; - } -} - -SZ_INTERNAL void _sz_sift_down(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start, - sz_size_t end) { - sz_size_t root = start; - while (2 * root + 1 <= end) { - sz_size_t child = 2 * root + 1; - if (child + 1 <= end && less(sequence, order[child], order[child + 1])) { child++; } - if (!less(sequence, order[root], order[child])) { return; } - sz_u64_swap(order + root, order + child); - root = child; - } -} - -SZ_INTERNAL void _sz_heapify(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t count) { - sz_size_t start = (count - 2) / 2; - while (1) { - _sz_sift_down(sequence, less, order, start, count - 1); - if (start == 0) return; - start--; - } -} - -SZ_INTERNAL void _sz_heapsort(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last) { - sz_u64_t *order = sequence->order; - sz_size_t count = last - first; - _sz_heapify(sequence, less, order + first, count); - sz_size_t end = count - 1; - while (end > 0) { - sz_u64_swap(order + first, order + first + end); - end--; - _sz_sift_down(sequence, less, order + first, 0, end); - } -} - -SZ_PUBLIC void sz_sort_introsort_recursion(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, - sz_size_t last, sz_size_t depth) { - - sz_size_t length = last - first; - switch (length) { - case 0: - case 1: return; - case 2: - if (less(sequence, sequence->order[first + 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[first + 1]); - return; - case 3: { - sz_u64_t a = sequence->order[first]; - sz_u64_t b = sequence->order[first + 1]; - sz_u64_t c = sequence->order[first + 2]; - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - if (less(sequence, c, b)) sz_u64_swap(&c, &b); - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - sequence->order[first] = a; - sequence->order[first + 1] = b; - sequence->order[first + 2] = c; - return; - } - } - // Until a certain length, the quadratic-complexity insertion-sort is fine - if (length <= 16) { - sz_sequence_t sub_seq = *sequence; - sub_seq.order += first; - sub_seq.count = length; - sz_sort_insertion(&sub_seq, less); - return; - } - - // Fallback to N-logN-complexity heap-sort - if (depth == 0) { - _sz_heapsort(sequence, less, first, last); - return; - } - - --depth; - - // Median-of-three logic to choose pivot - sz_size_t median = first + length / 2; - if (less(sequence, sequence->order[median], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[median]); - if (less(sequence, sequence->order[last - 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[last - 1]); - if (less(sequence, sequence->order[median], sequence->order[last - 1])) - sz_u64_swap(&sequence->order[median], &sequence->order[last - 1]); - - // Partition using the median-of-three as the pivot - sz_u64_t pivot = sequence->order[median]; - sz_size_t left = first; - 
sz_size_t right = last - 1; - while (1) { - while (less(sequence, sequence->order[left], pivot)) left++; - while (less(sequence, pivot, sequence->order[right])) right--; - if (left >= right) break; - sz_u64_swap(&sequence->order[left], &sequence->order[right]); - left++; - right--; - } - - // Recursively sort the partitions - sz_sort_introsort_recursion(sequence, less, first, left, depth); - sz_sort_introsort_recursion(sequence, less, right + 1, last, depth); -} - -SZ_PUBLIC void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - if (sequence->count == 0) return; - sz_size_t size_is_not_power_of_two = (sequence->count & (sequence->count - 1)) != 0; - sz_size_t depth_limit = sz_size_log2i_nonzero(sequence->count) + size_is_not_power_of_two; - sz_sort_introsort_recursion(sequence, less, 0, sequence->count, depth_limit); -} - -SZ_PUBLIC void sz_sort_recursion( // - sz_sequence_t *sequence, sz_size_t bit_idx, sz_size_t bit_max, sz_sequence_comparator_t comparator, - sz_size_t partial_order_length) { - - if (!sequence->count) return; - - // Array of size one doesn't need sorting - only needs the prefix to be discarded. - if (sequence->count == 1) { - sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; - order_half_words[1] = 0; - return; - } - - // Partition a range of integers according to a specific bit value - sz_size_t split = 0; - sz_u64_t mask = (1ull << 63) >> bit_idx; - - // The clean approach would be to perform a single pass over the sequence. - // - // while (split != sequence->count && !(sequence->order[split] & mask)) ++split; - // for (sz_size_t i = split + 1; i < sequence->count; ++i) - // if (!(sequence->order[i] & mask)) sz_u64_swap(sequence->order + i, sequence->order + split), ++split; - // - // This, however, doesn't take into account the high relative cost of writes and swaps. - // To circumvent that, we can first count the total number of entries to be mapped into either part, - // and then walk through both parts, swapping the entries that are in the wrong part. - // This would often lead to a ~15% performance gain. - sz_size_t count_with_bit_set = 0; - for (sz_size_t i = 0; i != sequence->count; ++i) count_with_bit_set += (sequence->order[i] & mask) != 0; - split = sequence->count - count_with_bit_set; - - // It's possible that the sequence is already partitioned. - if (split != 0 && split != sequence->count) { - // Use two pointers to efficiently reposition elements. - // One pointer walks left-to-right from the start, and the other walks right-to-left from the end. - sz_size_t left = 0; - sz_size_t right = sequence->count - 1; - while (1) { - // Find the next element with the bit set on the left side. - while (left < split && !(sequence->order[left] & mask)) ++left; - // Find the next element without the bit set on the right side. - while (right >= split && (sequence->order[right] & mask)) --right; - // Swap the mispositioned elements. - if (left < split && right >= split) { - sz_u64_swap(sequence->order + left, sequence->order + right); - ++left; - --right; - } - else { break; } - } - } - - // Go down recursively. - if (bit_idx < bit_max) { - sz_sequence_t a = *sequence; - a.count = split; - sz_sort_recursion(&a, bit_idx + 1, bit_max, comparator, partial_order_length); - - sz_sequence_t b = *sequence; - b.order += split; - b.count -= split; - sz_sort_recursion(&b, bit_idx + 1, bit_max, comparator, partial_order_length); - } - // Reached the end of recursion. - else { - // Discard the prefixes.
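// To make sense of the zeroing below: at this point every 64-bit `order` entry packs up to four
// leading characters of its string into the high 32 bits (exported in `sz_sort_partial`), while the
// low 32 bits still hold the original index into the sequence. A hypothetical little-endian entry
// for the string "banana" sitting at index 7 would look like:
//
//     0x62616E6100000007   // 'b', 'a', 'n', 'a' in the top bytes, index 7 in the bottom half.
//
// Once all 32 prefix bits are consumed by the radix partitioning, the prefixes are zeroed, so the
// comparator-based introsort below sees plain indices again.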
- sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; - for (sz_size_t i = 0; i != sequence->count; ++i) { order_half_words[i * 2 + 1] = 0; } - - sz_sequence_t a = *sequence; - a.count = split; - sz_sort_introsort(&a, comparator); - - sz_sequence_t b = *sequence; - b.order += split; - b.count -= split; - sz_sort_introsort(&b, comparator); - } -} - -SZ_INTERNAL sz_bool_t _sz_sort_is_less(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { - sz_cptr_t i_str = sequence->get_start(sequence, i_key); - sz_cptr_t j_str = sequence->get_start(sequence, j_key); - sz_size_t i_len = sequence->get_length(sequence, i_key); - sz_size_t j_len = sequence->get_length(sequence, j_key); - return (sz_bool_t)(sz_order_serial(i_str, i_len, j_str, j_len) == sz_less_k); -} - -SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t partial_order_length) { - -#if SZ_DETECT_BIG_ENDIAN - // TODO: Implement partial sort for big-endian systems. For now this sorts the whole thing. - sz_unused(partial_order_length); - sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); -#else - - // Export up to 4 bytes into the `sequence` bits themselves - for (sz_size_t i = 0; i != sequence->count; ++i) { - sz_cptr_t begin = sequence->get_start(sequence, sequence->order[i]); - sz_size_t length = sequence->get_length(sequence, sequence->order[i]); - length = length > 4u ? 4u : length; - sz_ptr_t prefix = (sz_ptr_t)&sequence->order[i]; - for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; - } - - // Perform optionally-parallel radix sort on them - sz_sort_recursion(sequence, 0, 32, (sz_sequence_comparator_t)_sz_sort_is_less, partial_order_length); -#endif -} - -SZ_PUBLIC void sz_sort(sz_sequence_t *sequence) { -#if SZ_DETECT_BIG_ENDIAN - sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); -#else - sz_sort_partial(sequence, sequence->count); -#endif -} - -#pragma endregion - -/* - * @brief AVX2 implementation of the string search algorithms. - * Very minimalistic, but still faster than the serial implementation. - */ -#pragma region AVX2 Implementation - -#if SZ_USE_X86_AVX2 -#pragma GCC push_options -#pragma GCC target("avx2") -#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) -#include - -/** - * @brief Helper structure to simplify work with 256-bit registers. - */ -typedef union sz_u256_vec_t { - __m256i ymm; - __m128i xmms[2]; - sz_u64_t u64s[4]; - sz_u32_t u32s[8]; - sz_u16_t u16s[16]; - sz_u8_t u8s[32]; -} sz_u256_vec_t; - -SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: - //! https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations - return sz_order_serial(a, a_length, b, b_length); -} - -SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_u256_vec_t a_vec, b_vec; - - while (length >= 32) { - a_vec.ymm = _mm256_lddqu_si256((__m256i const *)a); - b_vec.ymm = _mm256_lddqu_si256((__m256i const *)b); - // One approach can be to use "movemasks", but we could also use a bitwise matching like `_mm256_testnzc_si256`. 
- int difference_mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi8(a_vec.ymm, b_vec.ymm)); - if (difference_mask == 0) { a += 32, b += 32, length -= 32; } - else { return sz_false_k; } - } - - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; -} - -SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - char value_char = *(char *)&value; - __m256i value_vec = _mm256_set1_epi8(value_char); - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores". - // - // for (; length >= 32; target += 32, length -= 32) _mm256_storeu_si256(target, value_vec); - // sz_fill_serial(target, length, value); - // - // When the buffer is small, there isn't much to innovate. - if (length <= 32) sz_fill_serial(target, length, value); - // When the buffer is aligned, we can avoid any split-stores. - else { - sz_size_t head_length = (32 - ((sz_size_t)target % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - sz_u16_t value16 = (sz_u16_t)value * 0x0101u; - sz_u32_t value32 = (sz_u32_t)value16 * 0x00010001u; - sz_u64_t value64 = (sz_u64_t)value32 * 0x0000000100000001ull; - - // Fill the head of the buffer. This part is much cleaner with AVX-512. - if (head_length & 1) *(sz_u8_t *)target = value, target++, head_length--; - if (head_length & 2) *(sz_u16_t *)target = value16, target += 2, head_length -= 2; - if (head_length & 4) *(sz_u32_t *)target = value32, target += 4, head_length -= 4; - if (head_length & 8) *(sz_u64_t *)target = value64, target += 8, head_length -= 8; - if (head_length & 16) - _mm_store_si128((__m128i *)target, _mm_set1_epi8(value_char)), target += 16, head_length -= 16; - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - - // Fill the aligned body of the buffer. - for (; body_length >= 32; target += 32, body_length -= 32) _mm256_store_si256((__m256i *)target, value_vec); - - // Fill the tail of the buffer. This part is much cleaner with AVX-512. - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - if (tail_length & 16) - _mm_store_si128((__m128i *)target, _mm_set1_epi8(value_char)), target += 16, tail_length -= 16; - if (tail_length & 8) *(sz_u64_t *)target = value64, target += 8, tail_length -= 8; - if (tail_length & 4) *(sz_u32_t *)target = value32, target += 4, tail_length -= 4; - if (tail_length & 2) *(sz_u16_t *)target = value16, target += 2, tail_length -= 2; - if (tail_length & 1) *(sz_u8_t *)target = value, target++, tail_length--; - } -} - -SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores" and "loads". - // - // for (; length >= 32; target += 32, source += 32, length -= 32) - // _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - // sz_copy_serial(target, source, length); - // - // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core, - // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. - // For now, let's avoid the cases beyond the L2 size. 
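// A note on the head/tail peeling used in `sz_fill_avx2` above and in `sz_copy_avx2` below: the
// misaligned edges are consumed with progressively larger power-of-two stores. A short trace for a
// hypothetical `target` with `target % 32 == 13`:
//
//     head_length = (32 - 13) % 32 = 19 = 16 + 2 + 1
//     1-byte store  -> target % 32 == 14
//     2-byte store  -> target % 32 == 16
//     16-byte store -> target % 32 == 0, and the aligned 32-byte stores can begin.
//
// Peeling the smallest chunks first is what lets the 16-byte `_mm_store_si128` land on a
// 16-byte-aligned address.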
- int is_huge = length > 1ull * 1024ull * 1024ull; - if (length <= 32) { sz_copy_serial(target, source, length); } - // When dealing with larger arrays, the optimization is not as simple as with the `sz_fill_avx2` function, - // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer, - // we can use aligned loads and stores, and the performance will be great. - else if ((sz_size_t)target % 32 == 0 && (sz_size_t)source % 32 == 0 && !is_huge) { - for (; length >= 32; target += 32, source += 32, length -= 32) - _mm256_store_si256((__m256i *)target, _mm256_load_si256((__m256i const *)source)); - if (length) sz_copy_serial(target, source, length); - } - // The trickiest case is when both `source` and `target` are not aligned. - // In this and simpler cases we can copy enough bytes into `target` to reach its cacheline boundary, - // and then combine unaligned loads with aligned stores. - else { - sz_size_t head_length = (32 - ((sz_size_t)target % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - - // Fill the head of the buffer. This part is much cleaner with AVX-512. - if (head_length & 1) *(sz_u8_t *)target = *(sz_u8_t *)source, target++, source++, head_length--; - if (head_length & 2) *(sz_u16_t *)target = *(sz_u16_t *)source, target += 2, source += 2, head_length -= 2; - if (head_length & 4) *(sz_u32_t *)target = *(sz_u32_t *)source, target += 4, source += 4, head_length -= 4; - if (head_length & 8) *(sz_u64_t *)target = *(sz_u64_t *)source, target += 8, source += 8, head_length -= 8; - if (head_length & 16) - _mm_store_si128((__m128i *)target, _mm_lddqu_si128((__m128i const *)source)), target += 16, source += 16, - head_length -= 16; - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - - // Fill the aligned body of the buffer. - if (!is_huge) { - for (; body_length >= 32; target += 32, source += 32, body_length -= 32) - _mm256_store_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - } - // When the buffer is huge, we can traverse it in 2 directions. - else { - for (; body_length >= 64; target += 32, source += 32, body_length -= 64) { - _mm256_store_si256((__m256i *)(target), _mm256_lddqu_si256((__m256i const *)(source))); - _mm256_store_si256((__m256i *)(target + body_length - 32), - _mm256_lddqu_si256((__m256i const *)(source + body_length - 32))); - } - if (body_length) _mm256_store_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - } - - // Fill the tail of the buffer. This part is much cleaner with AVX-512.
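// In the huge-buffer branch above, the front cursor advances by 32 bytes per iteration while
// `body_length` shrinks by 64, so the mirrored store walks backwards and the two meet in the middle.
// A trace for a 128-byte body (an illustrative size, not a requirement):
//
//     iteration 1: copies bytes [0, 32)  and [96, 128)
//     iteration 2: copies bytes [32, 64) and [64, 96)
//
// and the trailing `if (body_length)` covers the single 32-byte chunk left over whenever the body is
// an odd multiple of 32.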
- sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - if (tail_length & 16) - _mm_store_si128((__m128i *)target, _mm_lddqu_si128((__m128i const *)source)), target += 16, source += 16, - tail_length -= 16; - if (tail_length & 8) *(sz_u64_t *)target = *(sz_u64_t *)source, target += 8, source += 8, tail_length -= 8; - if (tail_length & 4) *(sz_u32_t *)target = *(sz_u32_t *)source, target += 4, source += 4, tail_length -= 4; - if (tail_length & 2) *(sz_u16_t *)target = *(sz_u16_t *)source, target += 2, source += 2, tail_length -= 2; - if (tail_length & 1) *(sz_u8_t *)target = *(sz_u8_t *)source, target++, source++, tail_length--; - } -} - -SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - if (target < source || target >= source + length) { - for (; length >= 32; target += 32, source += 32, length -= 32) - _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - while (length--) *(target++) = *(source++); - } - else { - // Jump to the end and walk backwards. - for (target += length, source += length; length >= 32; length -= 32) - _mm256_storeu_si256((__m256i *)(target -= 32), _mm256_lddqu_si256((__m256i const *)(source -= 32))); - while (length--) *(--target) = *(--source); - } -} - -SZ_PUBLIC sz_u64_t sz_checksum_avx2(sz_cptr_t text, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "loads". - // - // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core, - // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. - // For now, let's avoid the cases beyond the L2 size. - int is_huge = length > 1ull * 1024ull * 1024ull; - - // When the buffer is small, there isn't much to innovate. - if (length <= 32) { return sz_checksum_serial(text, length); } - else if (!is_huge) { - sz_u256_vec_t text_vec, sums_vec; - sums_vec.ymm = _mm256_setzero_si256(); - for (; length >= 32; text += 32, length -= 32) { - text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - sz_u64_t result = low + high; - if (length) result += sz_checksum_serial(text, length); - return result; - } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // Most notably, we can avoid populating the cache with the entire buffer, and instead traverse it in 2 directions. - else { - sz_size_t head_length = (32 - ((sz_size_t)text % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(text + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - sz_u64_t result = 0; - - // Handle the head - while (head_length--) result += *text++; - - sz_u256_vec_t text_vec, sums_vec; - sums_vec.ymm = _mm256_setzero_si256(); - // Fill the aligned body of the buffer. 
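// A note on the reduction used in this function: `_mm256_sad_epu8(x, zero)` sums every group of
// eight unsigned bytes into the low 16 bits of the corresponding 64-bit lane, so accumulating it
// with `_mm256_add_epi64` maintains four independent 64-bit byte-sum counters that will not
// overflow for any realistic input. A scalar equivalent of one such step, for illustration only:
//
//     for (int lane = 0; lane != 4; ++lane) {
//         sz_u64_t partial = 0;
//         for (int byte = 0; byte != 8; ++byte) partial += text[lane * 8 + byte];
//         sums[lane] += partial;
//     }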
- if (!is_huge) { - for (; body_length >= 32; text += 32, body_length -= 32) { - text_vec.ymm = _mm256_stream_load_si256((__m256i const *)text); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - } - // When the buffer is huge, we can traverse it in 2 directions. - else { - sz_u256_vec_t text_reversed_vec, sums_reversed_vec; - sums_reversed_vec.ymm = _mm256_setzero_si256(); - for (; body_length >= 64; text += 64, body_length -= 64) { - text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - text_reversed_vec.ymm = _mm256_stream_load_si256((__m256i *)(text + body_length - 64)); - sums_reversed_vec.ymm = _mm256_add_epi64( - sums_reversed_vec.ymm, _mm256_sad_epu8(text_reversed_vec.ymm, _mm256_setzero_si256())); - } - if (body_length >= 32) { - text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, sums_reversed_vec.ymm); - } - - // Handle the tail - while (tail_length--) result += *text++; - - // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - result += low + high; - return result; - } -} - -SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state than for the actual computation. - // But if at least 3 cache lines are touched, the AVX-2 implementation should be faster. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } - - // We need to pull the lookup table into 16x YMM registers. - // The biggest issue is reorganizing the data in the lookup table, as AVX2 doesn't have 256-bit shuffle, - // it only has 128-bit "within-lane" shuffle. Still, it's wiser to use full YMM registers, instead of XMM, - // so that we can at least compensate for the high latency with a twice larger window and one more level of lookup.
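// In scalar terms, the 256-entry table is treated as 16 rows of 16 entries: the low nibble of every
// input byte picks the column (that is what `_mm256_shuffle_epi8` does against each row), and the
// high nibble picks the row (that is what the blend tree below does, one bit at a time):
//
//     sz_u8_t input = ((sz_u8_t const *)source)[i];
//     sz_u8_t row = input >> 4, column = input & 0x0F;
//     ((sz_u8_t *)target)[i] = ((sz_u8_t const *)lut)[row * 16 + column];
//
// The register names below simply spell out which 16-entry row each YMM register holds.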
- sz_u256_vec_t lut_0_to_15_vec, lut_16_to_31_vec, lut_32_to_47_vec, lut_48_to_63_vec, // - lut_64_to_79_vec, lut_80_to_95_vec, lut_96_to_111_vec, lut_112_to_127_vec, // - lut_128_to_143_vec, lut_144_to_159_vec, lut_160_to_175_vec, lut_176_to_191_vec, // - lut_192_to_207_vec, lut_208_to_223_vec, lut_224_to_239_vec, lut_240_to_255_vec; - - lut_0_to_15_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut))); - lut_16_to_31_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 16))); - lut_32_to_47_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 32))); - lut_48_to_63_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 48))); - lut_64_to_79_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 64))); - lut_80_to_95_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 80))); - lut_96_to_111_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 96))); - lut_112_to_127_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 112))); - lut_128_to_143_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 128))); - lut_144_to_159_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 144))); - lut_160_to_175_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 160))); - lut_176_to_191_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 176))); - lut_192_to_207_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 192))); - lut_208_to_223_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 208))); - lut_224_to_239_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 224))); - lut_240_to_255_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 240))); - - // Assuming each lookup is performed within 16 elements of 256, we need to reduce the scope by 16x = 2^4. - sz_u256_vec_t not_first_bit_vec, not_second_bit_vec, not_third_bit_vec, not_fourth_bit_vec; - - /// Top and bottom nibbles of the source are used separately. - sz_u256_vec_t source_vec, source_bot_vec; - sz_u256_vec_t blended_0_to_31_vec, blended_32_to_63_vec, blended_64_to_95_vec, blended_96_to_127_vec, - blended_128_to_159_vec, blended_160_to_191_vec, blended_192_to_223_vec, blended_224_to_255_vec; - - // Handling the head. - while (length >= 32) { - // Load and separate the nibbles of each byte in the source. - source_vec.ymm = _mm256_lddqu_si256((__m256i const *)source); - source_bot_vec.ymm = _mm256_and_si256(source_vec.ymm, _mm256_set1_epi8((char)0x0F)); - - // In the first round, we select using the 4th bit. 
- not_fourth_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x10), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_16_to_31_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_0_to_15_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_32_to_63_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_48_to_63_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_32_to_47_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_64_to_95_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_80_to_95_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_64_to_79_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_96_to_127_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_112_to_127_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_96_to_111_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_144_to_159_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_128_to_143_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_160_to_191_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_176_to_191_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_160_to_175_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_192_to_223_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_208_to_223_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_192_to_207_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_224_to_255_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_240_to_255_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_224_to_239_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - - // Perform a tree-like reduction of the 8x "blended" YMM registers, depending on the "source" content. - // The first round selects using the 3rd bit. - not_third_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x20), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_32_to_63_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_third_bit_vec.ymm); - blended_64_to_95_vec.ymm = _mm256_blendv_epi8( // - blended_96_to_127_vec.ymm, // - blended_64_to_95_vec.ymm, // - not_third_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - blended_160_to_191_vec.ymm, // - blended_128_to_159_vec.ymm, // - not_third_bit_vec.ymm); - blended_192_to_223_vec.ymm = _mm256_blendv_epi8( // - blended_224_to_255_vec.ymm, // - blended_192_to_223_vec.ymm, // - not_third_bit_vec.ymm); - - // The second round selects using the 2nd bit. - not_second_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x40), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_64_to_95_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_second_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - blended_192_to_223_vec.ymm, // - blended_128_to_159_vec.ymm, // - not_second_bit_vec.ymm); - - // The third round selects using the 1st bit. 
- not_first_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x80), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_128_to_159_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_first_bit_vec.ymm); - - // And dump the result into the target. - _mm256_storeu_si256((__m256i *)target, blended_0_to_31_vec.ymm); - source += 32, target += 32, length -= 32; - } - - // Handle the tail. - if (length) sz_look_up_transform_serial(source, length, lut, target); -} - -SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - int mask; - sz_u256_vec_t h_vec, n_vec; - n_vec.ymm = _mm256_set1_epi8(n[0]); - - while (h_length >= 32) { - h_vec.ymm = _mm256_lddqu_si256((__m256i const *)h); - mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm)); - if (mask) return h + sz_u32_ctz(mask); - h += 32, h_length -= 32; - } - - return sz_find_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - int mask; - sz_u256_vec_t h_vec, n_vec; - n_vec.ymm = _mm256_set1_epi8(n[0]); - - while (h_length >= 32) { - h_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + h_length - 32)); - mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm)); - if (mask) return h + h_length - 1 - sz_u32_clz(mask); - h_length -= 32; - } - - return sz_rfind_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_avx2(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into YMM registers. - int matches; - sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]); - n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]); - n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]); - - // Scan through the string. - for (; h_length >= n_length + 32; h += 32, h_length -= 32) { - h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_first)); - h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_mid)); - h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_last)); - matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); - while (matches) { - int potential_offset = sz_u32_ctz(matches); - if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - - return sz_find_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx2(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. 
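// The same filtering idea as in `sz_find_avx2` above: three bytes of the needle, at the offsets
// chosen by `_sz_locate_needle_anomalies` (presumably its first, middle, and last bytes, or similarly
// distinctive positions), are broadcast and compared against 32 haystack positions at once, and only
// the positions where all three masks agree are verified with a full `sz_equal`. For a hypothetical
// needle "needle", if the chosen offsets were 0, 3, and 5, the broadcast characters would be 'n',
// 'd', and 'e', so a candidate position survives the cheap test only when those three line up.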
- sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into YMM registers. - int matches; - sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]); - n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]); - n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]); - - // Scan through the string. - sz_cptr_t h_reversed; - for (; h_length >= n_length + 32; h_length -= 32) { - h_reversed = h + h_length - n_length - 32 + 1; - h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_first)); - h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_mid)); - h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_last)); - matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); - while (matches) { - int potential_offset = sz_u32_clz(matches); - if (sz_equal(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - matches &= ~(1 << (31 - potential_offset)); - } - } - - return sz_rfind_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_avx2(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - - // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. - // That way when we invoke `_mm256_shuffle_epi8` we can use the same mask for both lanes. - sz_u256_vec_t filter_even_vec, filter_odd_vec; - for (sz_size_t i = 0; i != 16; ++i) - filter_even_vec.u8s[i] = filter->_u8s[i * 2], filter_odd_vec.u8s[i] = filter->_u8s[i * 2 + 1]; - filter_even_vec.xmms[1] = filter_even_vec.xmms[0]; - filter_odd_vec.xmms[1] = filter_odd_vec.xmms[0]; - - sz_u256_vec_t text_vec; - sz_u256_vec_t matches_vec; - sz_u256_vec_t lower_nibbles_vec, higher_nibbles_vec; - sz_u256_vec_t bitset_even_vec, bitset_odd_vec; - sz_u256_vec_t bitmask_vec, bitmask_lookup_vec; - bitmask_lookup_vec.ymm = _mm256_set_epi8(-128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); - - while (length >= 32) { - // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set" - // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so - // StringZilla uses a somewhat different approach. - // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new - // - // sz_u8_t input = *(sz_u8_t const *)text; - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = filter_even_vec.u8s[hi_nibble]; - // sz_u8_t bitset_odd = filter_odd_vec.u8s[hi_nibble]; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_u8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd; - // if ((bitset & bitmask) != 0) return text; - // else { length--, text++; } - // - // The nice part about this is that loading the strided data is very easy with Arm NEON, - // while on x86 CPUs after AVX, shuffles within 256 bits shouldn't be an issue either.
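- // For reference, the same membership test can be restated as a standalone scalar helper. The
- // name `_sz_charset_contains_scalar` is hypothetical, and the sketch assumes the `sz_charset_t`
- // layout implied above, where byte `c` maps to bit `c % 8` of `filter->_u8s[c / 8]`, so that
- // `c / 8 == hi_nibble * 2 + (lo_nibble >= 8)` and `c % 8 == lo_nibble & 0x7`:
- //
- // SZ_INTERNAL int _sz_charset_contains_scalar(sz_charset_t const *filter, sz_u8_t c) {
- //     sz_u8_t lo_nibble = (sz_u8_t)(c & 0x0f), hi_nibble = (sz_u8_t)(c >> 4);
- //     // The even/odd split above is what lets a 16-entry shuffle index this table by the high nibble alone.
- //     sz_u8_t bitset = lo_nibble < 8 ? filter->_u8s[hi_nibble * 2] : filter->_u8s[hi_nibble * 2 + 1];
- //     return (bitset & (sz_u8_t)(1 << (lo_nibble & 0x7))) != 0;
- // }
- //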
- text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); - lower_nibbles_vec.ymm = _mm256_and_si256(text_vec.ymm, _mm256_set1_epi8(0x0f)); - bitmask_vec.ymm = _mm256_shuffle_epi8(bitmask_lookup_vec.ymm, lower_nibbles_vec.ymm); - // - // At this point we can validate the `bitmask_vec` contents like this: - // - // for (sz_size_t i = 0; i != 32; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); - // } - // - // Shift right every byte by 4 bits. - // There is no `_mm256_srli_epi8` intrinsic, so we have to use `_mm256_srli_epi16` - // and combine it with a mask to clear the higher bits. - higher_nibbles_vec.ymm = _mm256_and_si256(_mm256_srli_epi16(text_vec.ymm, 4), _mm256_set1_epi8(0x0f)); - bitset_even_vec.ymm = _mm256_shuffle_epi8(filter_even_vec.ymm, higher_nibbles_vec.ymm); - bitset_odd_vec.ymm = _mm256_shuffle_epi8(filter_odd_vec.ymm, higher_nibbles_vec.ymm); - // - // At this point we can validate the `bitset_even_vec` and `bitset_odd_vec` contents like this: - // - // for (sz_size_t i = 0; i != 32; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t const *bitset_ptr = &filter->_u8s[0]; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; - // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); - // } - // - __m256i take_first = _mm256_cmpgt_epi8(_mm256_set1_epi8(8), lower_nibbles_vec.ymm); - bitset_even_vec.ymm = _mm256_blendv_epi8(bitset_odd_vec.ymm, bitset_even_vec.ymm, take_first); - - // It would have been great to have an instruction that tests the bits and then broadcasts - // the matching bit into all bits in that byte. But we don't have that, so we have to - // `and`, `cmpeq`, `movemask`, and then invert at the end... - matches_vec.ymm = _mm256_and_si256(bitset_even_vec.ymm, bitmask_vec.ymm); - matches_vec.ymm = _mm256_cmpeq_epi8(matches_vec.ymm, _mm256_setzero_si256()); - int matches_mask = ~_mm256_movemask_epi8(matches_vec.ymm); - if (matches_mask) { - int offset = sz_u32_ctz(matches_mask); - return text + offset; - } - else { text += 32, length -= 32; } - } - - return sz_find_charset_serial(text, length, filter); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx2(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); -} - -/** - * @brief There is no AVX2 instruction for fast multiplication of 64-bit integers. - * This implementation is coming from Agner Fog's Vector Class Library. 
- */ -SZ_INTERNAL __m256i _mm256_mul_epu64(__m256i a, __m256i b) { - __m256i bswap = _mm256_shuffle_epi32(b, 0xB1); - __m256i prodlh = _mm256_mullo_epi32(a, bswap); - __m256i zero = _mm256_setzero_si256(); - __m256i prodlh2 = _mm256_hadd_epi32(prodlh, zero); - __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2, 0x73); - __m256i prodll = _mm256_mul_epu32(a, b); - __m256i prod = _mm256_add_epi64(prodll, prodlh3); - return prod; -} - -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - if (length < 4 * window_length) { - sz_hashes_serial(start, length, window_length, step, callback, callback_handle); - return; - } - - // Using AVX2, we can perform 4 long integer multiplications and additions within one register. - // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel. - sz_size_t const max_hashes = length - window_length + 1; - sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads. - sz_u8_t const *text_first = (sz_u8_t const *)start; - sz_u8_t const *text_second = text_first + min_hashes_per_thread; - sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2; - sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3; - sz_u8_t const *text_end = text_first + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Broadcast the constants into the registers. - sz_u256_vec_t prime_vec, golden_ratio_vec; - sz_u256_vec_t base_low_vec, base_high_vec, prime_power_low_vec, prime_power_high_vec, shift_high_vec; - base_low_vec.ymm = _mm256_set1_epi64x(31ull); - base_high_vec.ymm = _mm256_set1_epi64x(257ull); - shift_high_vec.ymm = _mm256_set1_epi64x(77ull); - prime_vec.ymm = _mm256_set1_epi64x(SZ_U64_MAX_PRIME); - golden_ratio_vec.ymm = _mm256_set1_epi64x(11400714819323198485ull); - prime_power_low_vec.ymm = _mm256_set1_epi64x(prime_power_low); - prime_power_high_vec.ymm = _mm256_set1_epi64x(prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u256_vec_t hash_low_vec, hash_high_vec, hash_mix_vec, chars_low_vec, chars_high_vec; - hash_low_vec.ymm = _mm256_setzero_si256(); - hash_high_vec.ymm = _mm256_setzero_si256(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. 
- hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - } - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t const step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - hash_low_vec.ymm = - _mm256_sub_epi64(hash_low_vec.ymm, _mm256_mul_epu64(chars_low_vec.ymm, prime_power_low_vec.ymm)); - hash_high_vec.ymm = - _mm256_sub_epi64(hash_high_vec.ymm, _mm256_mul_epu64(chars_high_vec.ymm, prime_power_high_vec.ymm)); - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. - hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - - // 5. 
Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - if ((cycle & step_mask) == 0) { - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - } - } -} - -#pragma clang attribute pop -#pragma GCC pop_options -#endif -#pragma endregion - -/* - * @brief AVX-512 implementation of the string search algorithms. - * - * Different subsets of AVX-512 were introduced in different years: - * - 2017 SkyLake: F, CD, ER, PF, VL, DQ, BW - * - 2018 CannonLake: IFMA, VBMI - * - 2019 IceLake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES - * - 2020 TigerLake: VP2INTERSECT - */ -#pragma region AVX512 Implementation - -#if SZ_USE_X86_AVX512 -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) -#include - -/** - * @brief Helper structure to simplify work with 512-bit registers. - */ -typedef union sz_u512_vec_t { - __m512i zmm; - __m256i ymms[2]; - __m128i xmms[4]; - sz_u64_t u64s[8]; - sz_u32_t u32s[16]; - sz_u16_t u16s[32]; - sz_u8_t u8s[64]; - sz_i64_t i64s[8]; - sz_i32_t i32s[16]; -} sz_u512_vec_t; - -SZ_INTERNAL __mmask64 _sz_u64_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 64: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 64: - return _bzhi_u64(0xFFFFFFFFFFFFFFFF, n < 64 ? (sz_u32_t)n : 64); -} - -SZ_INTERNAL __mmask32 _sz_u32_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 32: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 32: - return _bzhi_u32(0xFFFFFFFF, n < 32 ? (sz_u32_t)n : 32); -} - -SZ_INTERNAL __mmask16 _sz_u16_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 16: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 16: - return _bzhi_u32(0xFFFFFFFF, n < 16 ? 
(sz_u32_t)n : 16); -} - -SZ_INTERNAL __mmask16 _sz_u16_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 16: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 16: - return (__mmask16)_bzhi_u32(0xFFFFFFFF, (sz_u32_t)n); -} - -SZ_INTERNAL __mmask32 _sz_u32_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 32: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 32: - return _bzhi_u32(0xFFFFFFFF, (sz_u32_t)n); -} - -SZ_INTERNAL __mmask64 _sz_u64_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 64: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 64: - return _bzhi_u64(0xFFFFFFFFFFFFFFFF, (sz_u32_t)n); -} - -SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - sz_u512_vec_t a_vec, b_vec; - - // Pointer arithmetic is cheap, fetching memory is not! - // So we can use the masked loads to fetch at most one cache-line for each string, - // compare the prefixes, and only then move forward. - sz_size_t a_head_length = 64 - ((sz_size_t)a % 64); // 63 or less. - sz_size_t b_head_length = 64 - ((sz_size_t)b % 64); // 63 or less. - a_head_length = a_head_length < a_length ? a_head_length : a_length; - b_head_length = b_head_length < b_length ? b_head_length : b_length; - sz_size_t head_length = a_head_length < b_head_length ? a_head_length : b_head_length; - __mmask64 head_mask = _sz_u64_mask_until(head_length); - a_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, b); - __mmask64 mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - else if (head_length == a_length && head_length == b_length) { return sz_equal_k; } - else { a += head_length, b += head_length, a_length -= head_length, b_length -= head_length; } - - // The rare case, when both string are very long. - __mmask64 a_mask, b_mask; - while ((a_length >= 64) & (b_length >= 64)) { - a_vec.zmm = _mm512_loadu_si512(a); - b_vec.zmm = _mm512_loadu_si512(b); - mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - a += 64, b += 64, a_length -= 64, b_length -= 64; - } - - // In most common scenarios at least one of the strings is under 64 bytes. - if (a_length | b_length) { - a_mask = _sz_u64_clamp_mask_until(a_length); - b_mask = _sz_u64_clamp_mask_until(b_length); - a_vec.zmm = _mm512_maskz_loadu_epi8(a_mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(b_mask, b); - // The AVX-512 `_mm512_mask_cmpneq_epi8_mask` intrinsics are generally handy in such environments. - // They, however, have latency 3 on most modern CPUs. Using AVX2: `_mm256_cmpeq_epi8` would have - // been cheaper, if we didn't have to apply `_mm256_movemask_epi8` afterwards. 
- mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - // From logic perspective, the hardest cases are "abc\0" and "abc". - // The result must be `sz_greater_k`, as the latter is shorter. - else { return _sz_order_scalars(a_length, b_length); } - } - - return sz_equal_k; -} - -SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - __mmask64 mask; - sz_u512_vec_t a_vec, b_vec; - - while (length >= 64) { - a_vec.zmm = _mm512_loadu_si512(a); - b_vec.zmm = _mm512_loadu_si512(b); - mask = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask != 0) return sz_false_k; - a += 64, b += 64, length -= 64; - } - - if (length) { - mask = _sz_u64_mask_until(length); - a_vec.zmm = _mm512_maskz_loadu_epi8(mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(mask, b); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpneq_epi8_mask(mask, a_vec.zmm, b_vec.zmm); - return (sz_bool_t)(mask == 0); - } - - return sz_true_k; -} - -SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - __m512i value_vec = _mm512_set1_epi8(value); - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores". - // - // for (; length >= 64; target += 64, length -= 64) _mm512_storeu_si512(target, value_vec); - // _mm512_mask_storeu_epi8(target, _sz_u64_mask_until(length), value_vec); - // - // When the buffer is small, there isn't much to innovate. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, value_vec); - } - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - _mm512_mask_storeu_epi8(target, head_mask, value_vec); - for (target += head_length; body_length >= 64; target += 64, body_length -= 64) - _mm512_store_si512(target, value_vec); - _mm512_mask_storeu_epi8(target, tail_mask, value_vec); - } -} - -SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores" and "loads". - // - // for (; length >= 64; target += 64, source += 64, length -= 64) - // _mm512_storeu_si512(target, _mm512_loadu_si512(source)); - // __mmask64 mask = _sz_u64_mask_until(length); - // _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - // - // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, - // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. 
- // With two strings, we may consider the overall workload huge, if each exceeds 1 MB in length. - int const is_huge = length >= 1ull * 1024ull * 1024ull; - - // When the buffer is small, there isn't much to innovate. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - } - // When dealing with larger arrays, the optimization is not as simple as with the `sz_fill_avx512` function, - // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer, - // we can use aligned loads and stores, and the performance will be great. - else if ((sz_size_t)target % 64 == 0 && (sz_size_t)source % 64 == 0 && !is_huge) { - for (; length >= 64; target += 64, source += 64, length -= 64) - _mm512_store_si512(target, _mm512_load_si512(source)); - // At this point the length is guaranteed to be under 64. - __mmask64 mask = _sz_u64_mask_until(length); - // Aligned loads and stores would work too, but it's not defined. - _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - } - // The trickiest case is when both `source` and `target` are not aligned. - // In this and simpler cases we can copy enough bytes into `target` to reach its cacheline boundary, - // and then combine unaligned loads with aligned stores. - else if (!is_huge) { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - for (target += head_length, source += head_length; body_length >= 64; - target += 64, source += 64, body_length -= 64) - _mm512_store_si512(target, _mm512_loadu_si512(source)); // Unaligned load, but aligned store! - _mm512_mask_storeu_epi8(target, tail_mask, _mm512_maskz_loadu_epi8(tail_mask, source)); - } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // - // 1. Moving in both directions to maximize the throughput, when fetching from multiple - // memory pages. Also helps with cache set-associativity issues, as we won't always - // be fetching the same entries in the lookup table. - // 2. Using non-temporal stores to avoid polluting the cache. - // 3. Prefetching the next cache line, to avoid stalling the CPU. This is generally useless - // for predictable patterns, so disregard this advice. - // - // Bidirectional traversal adds about 10%, accelerating from 11 GB/s to 12 GB/s. - // Using "streaming stores" boosts us from 12 GB/s to 19 GB/s. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; - sz_size_t tail_length = (sz_size_t)(target + length) % 64; - sz_size_t body_length = length - head_length - tail_length; - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - _mm512_mask_storeu_epi8(target + head_length + body_length, tail_mask, - _mm512_maskz_loadu_epi8(tail_mask, source)); - - // Now in the main loop, we can use non-temporal loads and stores, - // performing the operation in both directions.
- for (target += head_length, source += head_length; // - body_length >= 128; // - target += 64, source += 64, body_length -= 128) { - _mm512_stream_si512((__m512i *)(target), _mm512_loadu_si512(source)); - _mm512_stream_si512((__m512i *)(target + body_length - 64), _mm512_loadu_si512(source + body_length - 64)); - } - if (body_length >= 64) _mm512_stream_si512((__m512i *)target, _mm512_loadu_si512(source)); - } -} - -SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - if (target == source) return; // Don't be silly, don't move the data if it's already there. - - // On very short buffers, that are one cache line in width or less, we don't need any loops. - // We can also avoid any data-dependencies between iterations, assuming we have 32 registers - // to pre-load the data, before writing it back. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - } - else if (length <= 128) { - sz_size_t last_length = length - 64; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_maskz_loadu_epi8(mask, source + 64); - _mm512_storeu_epi8(target, source0); - _mm512_mask_storeu_epi8(target + 64, mask, source1); - } - else if (length <= 192) { - sz_size_t last_length = length - 128; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_loadu_epi8(source + 64); - __m512i source2 = _mm512_maskz_loadu_epi8(mask, source + 128); - _mm512_storeu_epi8(target, source0); - _mm512_storeu_epi8(target + 64, source1); - _mm512_mask_storeu_epi8(target + 128, mask, source2); - } - else if (length <= 256) { - sz_size_t last_length = length - 192; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_loadu_epi8(source + 64); - __m512i source2 = _mm512_loadu_epi8(source + 128); - __m512i source3 = _mm512_maskz_loadu_epi8(mask, source + 192); - _mm512_storeu_epi8(target, source0); - _mm512_storeu_epi8(target + 64, source1); - _mm512_storeu_epi8(target + 128, source2); - _mm512_mask_storeu_epi8(target + 192, mask, source3); - } - - // If the regions don't overlap at all, just use "copy" and save some brain cells thinking about corner cases. - else if (target + length < source || target >= source + length) { sz_copy_avx512(target, source, length); } - - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - // The absolute most common case of using "moves" is shifting the data within a continuous buffer - // when adding a removing some values in it. In such cases, a typical shift is by 1, 2, 4, 8, 16, - // or 32 bytes, rarely larger. For small shifts, under the size of the ZMM register, we can use shuffles. 
- // - // Remember: - // - if we are shifting data left, we are traversing to the right. - // - if we are shifting data right, we are traversing to the left. - int const left_to_right_traversal = source > target; - - // Now we guarantee that the relative shift within registers is from 1 to 63 bytes and the output is aligned. - // Hopefully, we need to shift more than two ZMM registers, so we could consider the `valignr` instruction. - // Sadly, using `_mm512_alignr_epi8` doesn't make sense, as it operates at a 128-bit granularity. - // - // - `_mm256_alignr_epi8` shifts an entire 256-bit register, but we need many of them. - // - `_mm512_alignr_epi32` shifts 512-bit chunks, but only if the `shift` is a multiple of 4 bytes. - // - `_mm512_alignr_epi64` shifts 512-bit chunks by 8 bytes. - // - // All of those have a latency of 1 cycle, and the shift amount must be an immediate value! - // For 1-byte-shift granularity, the `_mm512_permutex2var_epi8` has a latency of 6 and needs VBMI! - // The most efficient and broadly compatible alternative could be to use a combination of align and shuffle. - // A similar approach was outlined in "Byte-wise alignr in AVX512F" by Wojciech Muła. - // http://0x80.pl/notesen/2016-10-16-avx512-byte-alignr.html - // - // That solution is quite a mouthful, assuming we need compile-time constants for the shift amount. - // A cleaner one, with a latency of 3 cycles, is to use `_mm512_permutexvar_epi8` or - // `_mm512_mask_permutexvar_epi8`, which can be seen as a combination of a cross-register shuffle and blend, - // and is available with VBMI. That solution is still noticeably slower than AVX2. - // - // The GLibC implementation also uses non-temporal stores for larger buffers; we don't. - // https://codebrowser.dev/glibc/glibc/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S.html - if (left_to_right_traversal) { - // Head, body, and tail. - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - for (target += head_length, source += head_length; body_length >= 64; - target += 64, source += 64, body_length -= 64) - _mm512_store_si512(target, _mm512_loadu_si512(source)); - _mm512_mask_storeu_epi8(target, tail_mask, _mm512_maskz_loadu_epi8(tail_mask, source)); - } - else { - // Tail, body, and head.
- _mm512_mask_storeu_epi8(target + head_length + body_length, tail_mask, - _mm512_maskz_loadu_epi8(tail_mask, source + head_length + body_length)); - for (; body_length >= 64; body_length -= 64) - _mm512_store_si512(target + head_length + body_length - 64, - _mm512_loadu_si512(source + head_length + body_length - 64)); - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - } - } -} - -SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); - - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + sz_u64_ctz(mask); - h += 64, h_length -= 64; - } - - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + sz_u64_ctz(mask); - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_avx512(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. - __mmask64 matches; - __mmask64 mask; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. - // We have several optimized versions of the lagorithm for shorter strings, - // but they all mimic the default case for unbounded length needles - if (n_length >= 64) { - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - if (sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - - // TODO: If the last character contains a bad byte, we can reposition the start of the next iteration. - // This will be very helpful for very long needles. - } - } - // If there are only 2 or 3 characters in the needle, we don't even need the nested loop. 
- else if (n_length <= 3) { - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - if (matches) return h + sz_u64_ctz(matches); - } - } - // If the needle is smaller than the size of the ZMM register, we can use masked comparisons - // to avoid the the inner-most nested loop and compare the entire needle against a haystack - // slice in 3 CPU cycles. - else { - __mmask64 n_mask = _sz_u64_mask_until(n_length); - sz_u512_vec_t n_full_vec, h_full_vec; - n_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, n); - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - h_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, h + potential_offset); - if (_mm512_mask_cmpneq_epi8_mask(n_mask, h_full_vec.zmm, n_full_vec.zmm) == 0) - return h + potential_offset; - matches &= matches - 1; - } - } - } - - // The "tail" of the function uses masked loads to process the remaining bytes. - { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - if (n_length <= 3 || sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); - - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h + h_length - 64); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + h_length - 1 - sz_u64_clz(mask); - h_length -= 64; - } - - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + 64 - sz_u64_clz(mask) - 1; - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. 
- if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx512(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. - __mmask64 mask; - __mmask64 matches; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. - sz_cptr_t h_reversed; - for (; h_length >= n_length + 64; h_length -= 64) { - h_reversed = h + h_length - n_length - 64 + 1; - h_first_vec.zmm = _mm512_loadu_si512(h_reversed + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h_reversed + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h_reversed + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_avx512(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } - - // The "tail" of the function uses masked loads to process the remaining bytes. - { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_avx512(h + 64 - potential_offset - 1, n, n_length)) - return h + 64 - potential_offset - 1; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } - - return SZ_NULL_CHAR; -} - -#pragma clang attribute pop -#pragma GCC pop_options - -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ - apply_to = function) - -/** - * @brief Computes the edit distance between two very short byte-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 63, and evaluates at most (63 * 2 + 1 = 127) diagonals, or just as many loop cycles. - * Supports an early exit, if the distance is bounded. - * Keeps all of the data and Levenshtein matrices skew diagonal in just a couple of registers. - * Benefits from the @b `vpermb` instructions, that can rotate the bytes across the entire ZMM register. 
- */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - - sz_size_t const max_length = 63u; - sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); - sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); - - // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register. - // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. - sz_size_t const shorter_dim = shorter_length + 1; - sz_size_t const longer_dim = longer_length + 1; - - // The next few buffers will be swapped around. - sz_u512_vec_t previous_vec, current_vec, next_vec; - sz_u512_vec_t gaps_vec, substitutions_vec; - - // Load the strings into ZMM registers - just once. - sz_u512_vec_t longer_vec, shorter_vec, shorter_rotated_vec, rotate_left_vec, rotate_right_vec, ones_vec, bound_vec; - longer_vec.zmm = _mm512_maskz_loadu_epi8(_sz_u64_mask_until(longer_length), longer); - rotate_left_vec.zmm = _mm512_set_epi8( // - 0, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, // - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, // - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, // - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - rotate_right_vec.zmm = _mm512_set_epi8( // - 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, // - 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, // - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, // - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 63); - ones_vec.zmm = _mm512_set1_epi8(1); - bound_vec.zmm = _mm512_set1_epi8(bound <= 255 ? (sz_u8_t)bound : 255); - - // To simplify comparisons and traversals, we want to reverse the order of bytes in the shorter string. - for (sz_size_t i = 0; i != shorter_length; ++i) shorter_vec.u8s[63 - i] = shorter[i]; - shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_vec.zmm); - - // Let's say we are dealing with 3 and 5 letter words. - // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim). - // It will have: - // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3. - // - 2 diagonals of fixed length, at positions: 4, 5. - // - 3 diagonals of decreasing length, at positions: 6, 7, 8. - sz_size_t const diagonals_count = shorter_dim + longer_dim - 1; - - // Initialize the first two diagonals: - // - // previous_vec.u8s[0] = 0; - // current_vec.u8s[0] = current_vec.u8s[1] = 1; - // - // We can do a similar thing with vector ops: - previous_vec.zmm = _mm512_setzero_si512(); - current_vec.zmm = _mm512_set1_epi8(1); - - // We skip diagonals 0 and 1, as they are trivial. - // We will start with diagonal 2, which has length 3, with the first and last elements being preset, - // so we are effectively computing just one value, as will be marked by a single set bit in - // the `next_diagonal_mask` on the very first iteration. - sz_size_t next_diagonal_index = 2; - __mmask64 next_diagonal_mask = 0; - - // Progress through the upper triangle of the Levenshtein matrix. 
- for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) { - // After this iteration, the values at offset `0` and `next_diagonal_index` in the `next_vec` - // should be set to `next_diagonal_index`, but it's easier to broadcast the value to the whole vector, - // and later merge with a mask with new values. - next_vec.zmm = _mm512_set1_epi8((sz_u8_t)next_diagonal_index); - - // The mask also adds one set bit. - next_diagonal_mask = _kor_mask64(next_diagonal_mask, 1); - next_diagonal_mask = _kshiftli_mask64(next_diagonal_mask, 1); - - // Check for equality between string slices. - __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - substitutions_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, substitutions_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(_mm512_permutexvar_epi8(rotate_right_vec.zmm, current_vec.zmm), current_vec.zmm), - ones_vec.zmm); - next_vec.zmm = _mm512_mask_min_epu8(next_vec.zmm, next_diagonal_mask, gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = current_vec.zmm; - current_vec.zmm = next_vec.zmm; - - // Shift the shorter string - shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_rotated_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. - __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - } - - // Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles. - for (; next_diagonal_index != longer_dim; ++next_diagonal_index) { - // After this iteration, the value `shorted_dim - 1` in the `next_vec` - // should be set to `next_diagonal_index`, but it's easier to broadcast the value to the whole vector, - // and later merge with a mask with new values. - next_vec.zmm = _mm512_set1_epi8((sz_u8_t)next_diagonal_index); - - // Make sure we update the first entry. - next_diagonal_mask = _kor_mask64(next_diagonal_mask, 1); - - // Check for equality between string slices. - __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(current_vec.zmm, _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm)), - ones_vec.zmm); - next_vec.zmm = _mm512_mask_min_epu8(next_vec.zmm, next_diagonal_mask, gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm); - current_vec.zmm = next_vec.zmm; - - // Let's shift the longer string now. - longer_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, longer_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. 
- __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - } - - // Now let's handle the bottom right triangle. - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - - // Check for equality between string slices. - __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(current_vec.zmm, _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm)), - ones_vec.zmm); - next_vec.zmm = _mm512_min_epu8(gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm); - current_vec.zmm = next_vec.zmm; - - // Let's shift the longer string now. - longer_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, longer_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. - __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - // In every following iteration we use a shorter prefix of each register, - // but we don't need to update the `next_diagonal_mask` anymore... except for the early exit. - next_diagonal_mask = _kshiftri_mask64(next_diagonal_mask, 1); - } - return current_vec.u8s[0]; -} - -/** - * @brief Computes the edit distance between two somewhat short byte-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 127, and evaluates at most (127 * 2 + 1 = 255) diagonals. - * Supports an early exit, if the distance is bounded. - * Uses a lot more CPU register space than the `upto63` variant. - * Benefits from the @b `vpermi2b` instructions, that can rotate the bytes in 2 registers at once. - * - * This may be one of the most frequently called kernels for: - * - source code analysis, assuming most lines are either under 80 or under 120 characters long. - * - DNA sequence alignment, as most short reads are 50-300 characters long. - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto127_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -/** - * @brief Computes the edit distance between two longer byte-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 255, and evaluates at most (255 * 2 + 1 = 511) diagonals. - * Supports an early exit, if the distance is bounded. - * Uses a lot more CPU register space than the `upto63` variant. - * - * Each of 2x string ends up occupying 4 ZMM registers, and each of 3x diagonals uses 4 ZMM registers. - * So 20x of the 32x are persistently occupied, and the rest are used for math temporarily. - * This is the largest space-efficient variant, as strings beyond 255 characters may require - * 16-bit accumulators, which would be a significant bottleneck.
- */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -/** - * @brief Computes the edit distance between two longer bytes-strings using the AVX-512VBMI extensions, - * assuming the upper distance bound can not exceed 255, but the string length can be arbitrary. - * - * Applies to string lengths up to 255, and evaluates at most (255 * 2 + 1 = 511) diagonals. - * Supports an early exit, if the distance is bounded. - * Uses a lot more CPU registers space, than the `upto63` variant. - * - * Each of 2x string ends up occupying 4 ZMM registers, and each of 3x diagonals uses 4 ZMM registers. - * So 20x of the 32x are persistently occupied, and the rest are used for math temporarily. - * This is the largest space-efficient variant, as strings beyond 255 characters may require - * 16-bit accumulators, which would be a significant bottleneck. - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto255bound_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -/** - * @brief Computes the edit distance between two mid-length UTF-8-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 127, and evaluates at most (127 * 2 + 1 = 511) diagonals. - * Supports an early exit, if the distance is bounded. - * Benefits from the @b `valignd` instructions used to rotate UTF-32 unpacked unicode codepoints. - * - * Each string is unpacked into 128 characters * 4 bytes per character / 64 bytes per register = 8 registers. - * - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_utf8_skewed_diagonals_upto127_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - sz_unused(shorter && longer && bound && alloc); - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // TODO: Generalize! - sz_size_t const max_length = 256u * 256u; - sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); - sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); - sz_unused(longer_length && bound && max_length); - -#if 0 - // We are going to store 3 diagonals of the matrix. - // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. - sz_size_t const shorter_dim = shorter_length + 1; - sz_size_t const longer_dim = longer_length + 1; - // Unlike the serial version, we also want to avoid reverse-order iteration over teh shorter string. - // So let's allocate a bit more memory and reverse-export our shorter string into that buffer. 
- sz_size_t const buffer_length = sizeof(sz_u16_t) * longer_dim * 3 + shorter_length; - sz_u16_t *const distances = (sz_u16_t *)alloc->allocate(buffer_length, alloc->handle); - if (!distances) return SZ_SIZE_MAX; - - // The next few pointers will be swapped around. - sz_u16_t *previous_distances = distances; - sz_u16_t *current_distances = previous_distances + longer_dim; - sz_u16_t *next_distances = current_distances + longer_dim; - sz_ptr_t const shorter_reversed = (sz_ptr_t)(next_distances + longer_dim); - - // Export the reversed string into the buffer. - for (sz_size_t i = 0; i != shorter_length; ++i) shorter_reversed[i] = shorter[shorter_length - 1 - i]; - - // Initialize the first two diagonals: - previous_distances[0] = 0; - current_distances[0] = current_distances[1] = 1; - - // Using ZMM registers, we can process 32x 16-bit values at once, - // storing 16 bytes of each string in YMM registers. - sz_u512_vec_t insertions_vec, deletions_vec, substitutions_vec, next_vec; - sz_u512_vec_t ones_u16_vec; - ones_u16_vec.zmm = _mm512_set1_epi16(1); - - // This is a mixed-precision implementation, using 8-bit representations for part of the operations. - // Even there, in case `SZ_USE_X86_AVX2=0`, let's use the `sz_u512_vec_t` type, addressing the first YMM halfs. - sz_u512_vec_t shorter_vec, longer_vec; - sz_u512_vec_t ones_u8_vec; - ones_u8_vec.ymms[0] = _mm256_set1_epi8(1); - - // Let's say we are dealing with 3 and 5 letter words. - // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim). - // It will have: - // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3. - // - 2 diagonals of fixed length, at positions: 4, 5. - // - 3 diagonals of decreasing length, at positions: 6, 7, 8. - sz_size_t const diagonals_count = shorter_dim + longer_dim - 1; - - // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_diagonal_index = 2; - for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = next_diagonal_index + 1; - for (sz_size_t offset_within_diagonal = 0; offset_within_diagonal + 2 < next_diagonal_length;) { - sz_u32_t remaining_length = (sz_u32_t)(next_diagonal_length - offset_within_diagonal - 2); - sz_u32_t register_length = remaining_length < 32 ? remaining_length : 32; - sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length); - longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + offset_within_diagonal); - // Our original code addressed the shorter string `[next_diagonal_index - offset_within_diagonal - 2]` - // for growing `offset_within_diagonal`. If the `shorter` string was reversed, the - // `[next_diagonal_index - offset_within_diagonal - 2]` would be equal to `[shorter_length - 1 - - // next_diagonal_index + offset_within_diagonal + 2]`. Which simplified would be equal to - // `[shorter_length - next_diagonal_index + offset_within_diagonal + 1]`. - shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8( // - remaining_length_mask, - shorter_reversed + shorter_length - next_diagonal_index + offset_within_diagonal + 1); - // For substitutions, perform the equality comparison using AVX2 instead of AVX-512 - // to get the result as a vector, instead of a bitmask. Adding 1 to every scalar we can overflow - // transforming from {0xFF, 0} values to {0, 1} values - exactly what we need. Then - upcast to 16-bit. 
- substitutions_vec.zmm = _mm512_cvtepi8_epi16( // - _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0])); - substitutions_vec.zmm = _mm512_add_epi16( // - substitutions_vec.zmm, - _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + offset_within_diagonal)); - // For insertions and deletions, on modern hardware, it's faster to issue two separate loads, - // than rotate the bytes in the ZMM register. - insertions_vec.zmm = - _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + offset_within_diagonal); - deletions_vec.zmm = - _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + offset_within_diagonal + 1); - // First get the minimum of insertions and deletions. - next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm); - next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm); - _mm512_mask_storeu_epi16(next_distances + offset_within_diagonal + 1, remaining_length_mask, next_vec.zmm); - offset_within_diagonal += register_length; - } - // Don't forget to populate the first row and the first column of the Levenshtein matrix. - next_distances[0] = next_distances[next_diagonal_length - 1] = (sz_u16_t)next_diagonal_index; - // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory. - sz_u16_t *temporary = previous_distances; - previous_distances = current_distances; - current_distances = next_distances; - next_distances = temporary; - } - - // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a - // larger diagonal. From now onwards, we will be shrinking. Instead of adding value equal to the skewed diagonal - // index on either side, we will be cropping those values out. - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; - for (sz_size_t i = 0; i != next_diagonal_length;) { - sz_u32_t remaining_length = (sz_u32_t)(next_diagonal_length - i); - sz_u32_t register_length = remaining_length < 32 ? remaining_length : 32; - sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length); - longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + next_diagonal_index - n + i); - // Our original code addressed the shorter string `[shorter_length - 1 - i]` for growing `i`. - // If the `shorter` string was reversed, the `[shorter_length - 1 - i]` would - // be equal to `[shorter_length - 1 - shorter_length + 1 + i]`. - // Which simplified would be equal to just `[i]`. Beautiful! - shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, shorter_reversed + i); - // For substitutions, perform the equality comparison using AVX2 instead of AVX-512 - // to get the result as a vector, instead of a bitmask. The compare it against the accumulated - // substitution costs. - substitutions_vec.zmm = _mm512_cvtepi8_epi16( // - _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0])); - substitutions_vec.zmm = _mm512_add_epi16( // - substitutions_vec.zmm, _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + i)); - // For insertions and deletions, on modern hardware, it's faster to issue two separate loads, - // than rotate the bytes in the ZMM register. 
- insertions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i); - deletions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i + 1); - // First get the minimum of insertions and deletions. - next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm); - next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm); - _mm512_mask_storeu_epi16(next_distances + i, remaining_length_mask, next_vec.zmm); - i += register_length; - } - - // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory, this time, with a shift, - // dropping the first element in the current array. - sz_u16_t *temporary = previous_distances; - previous_distances = current_distances + 1; - current_distances = next_distances; - next_distances = temporary; - } - - // Cache scalar before `free` call. - sz_size_t result = current_distances[0]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -#endif - return 0; -} - -SZ_INTERNAL sz_size_t sz_edit_distance_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Bounded computations may exit early. - int const is_bounded = bound < longer_length; - if (is_bounded) { - // If one of the strings is empty - the edit distance is equal to the length of the other one. - if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); - // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; - } - - // Make sure the shorter string is actually shorter. - if (shorter_length > longer_length) { - sz_cptr_t temporary = shorter; - shorter = longer; - longer = temporary; - sz_size_t temporary_length = shorter_length; - shorter_length = longer_length; - longer_length = temporary_length; - } - - // Dispatch the right implementation based on the length of the strings. - if (longer_length < 64u) - return _sz_edit_distance_skewed_diagonals_upto63_avx512( // - shorter, shorter_length, longer, longer_length, bound); - // else if (longer_length < 256u * 256u) - // return _sz_edit_distance_skewed_diagonals_upto65k_avx512( // - // shorter, shorter_length, longer, longer_length, bound, alloc); - else - return sz_edit_distance_serial(shorter, shorter_length, longer, longer_length, bound, alloc); -} - -SZ_PUBLIC sz_u64_t sz_checksum_avx512(sz_cptr_t text, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "loads". - // - // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, - // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. - // With two strings, we may consider the overal workload huge, if each exceeds 1 MB in length. - int const is_huge = length >= 1ull * 1024ull * 1024ull; - sz_u512_vec_t text_vec, sums_vec; - - // When the buffer is small, there isn't much to innovate. 
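- // The `_mm_sad_epu8` / `_mm512_sad_epu8` reductions below rely on the fact that a sum of absolute
- // differences against zero is just a sum of bytes: every 8-byte lane collapses into one 64-bit
- // partial sum. A scalar sketch of what the whole function computes, regardless of the branch taken:
- //
- //    sz_u64_t sum = 0;
- //    for (sz_size_t i = 0; i != length; ++i) sum += (sz_u8_t)text[i];
- //    return sum;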
- if (length <= 16) { - __mmask16 mask = _sz_u16_mask_until(length); - text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text); - sums_vec.xmms[0] = _mm_sad_epu8(text_vec.xmms[0], _mm_setzero_si128()); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_vec.xmms[0]); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_vec.xmms[0], 1); - return low + high; - } - else if (length <= 32) { - __mmask32 mask = _sz_u32_mask_until(length); - text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text); - sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256()); - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - return low + high; - } - else if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - text_vec.zmm = _mm512_maskz_loadu_epi8(mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - return _mm512_reduce_add_epi64(sums_vec.zmm); - } - else if (!is_huge) { - sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(text + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - for (text += head_length; body_length >= 64; text += 64, body_length -= 64) { - text_vec.zmm = _mm512_load_si512((__m512i const *)text); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - } - text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - return _mm512_reduce_add_epi64(sums_vec.zmm); - } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // - // 1. Moving in both directions to maximize the throughput, when fetching from multiple - // memory pages. Also helps with cache set-associativity issues, as we won't always - // be fetching the same entries in the lookup table. - // 2. Using non-temporal stores to avoid polluting the cache. - // 3. Prefetching the next cache line, to avoid stalling the CPU. This generally useless - // for predictable patterns, so disregard this advice. - // - // Bidirectional traversal generally adds about 10% to such algorithms. 
- else {
- sz_u512_vec_t text_reversed_vec, sums_reversed_vec;
- sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64;
- sz_size_t tail_length = (sz_size_t)(text + length) % 64;
- sz_size_t body_length = length - head_length - tail_length;
- __mmask64 head_mask = _sz_u64_mask_until(head_length);
- __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
-
- text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
- sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
- text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length);
- sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512());
-
- // Now in the main loop, we can use non-temporal loads and stores,
- // performing the operation in both directions.
- for (text += head_length; body_length >= 128; text += 64, body_length -= 128) {
- text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
- sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
- text_reversed_vec.zmm = _mm512_stream_load_si512((__m512i *)(text + body_length - 64));
- sums_reversed_vec.zmm =
- _mm512_add_epi64(sums_reversed_vec.zmm, _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()));
- }
- if (body_length >= 64) {
- text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
- sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
- }
-
- return _mm512_reduce_add_epi64(_mm512_add_epi64(sums_vec.zmm, sums_reversed_vec.zmm));
- }
-}
-
-SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, //
- sz_hash_callback_t callback, void *callback_handle) {
-
- if (length < window_length || !window_length) return;
- if (length < 4 * window_length) {
- sz_hashes_serial(start, length, window_length, step, callback, callback_handle);
- return;
- }
-
- // Using AVX2, we can perform 4 long integer multiplications and additions within one register.
- // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel.
- sz_size_t const max_hashes = length - window_length + 1;
- sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads.
- sz_u8_t const *text_first = (sz_u8_t const *)start;
- sz_u8_t const *text_second = text_first + min_hashes_per_thread;
- sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2;
- sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3;
- sz_u8_t const *text_end = text_first + length;
-
- // Broadcast the global constants into the registers.
- // Both high and low hashes will work with the same prime and golden ratio.
- sz_u512_vec_t prime_vec, golden_ratio_vec;
- prime_vec.zmm = _mm512_set1_epi64(SZ_U64_MAX_PRIME);
- golden_ratio_vec.zmm = _mm512_set1_epi64(11400714819323198485ull);
-
- // Prepare the `base ^ (window_length - 1) % prime` values that we are going to use to subtract the outgoing characters.
- sz_u64_t prime_power_low = 1, prime_power_high = 1;
- for (sz_size_t i = 0; i + 1 < window_length; ++i)
- prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME,
- prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME;
-
- // We will be evaluating 4 offsets at a time with 2 different hash functions.
- // We can fit all those 8 state variables in each of the following ZMM registers.
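- // For reference, every one of those 8 lanes effectively maintains a scalar polynomial rolling hash.
- // A rough per-lane sketch, assuming `base` is 31 or 257, `shift` is 0 or 77, and
- // `power == base ^ (window_length - 1) % prime` (the names here are illustrative, not the actual state below):
- //
- //    hash -= (text[i - window_length] + shift) * power; // drop the outgoing character
- //    hash = hash * base + (text[i] + shift);            // admit the incoming character
- //    if (hash > prime) hash -= prime;                   // cheap conditional "modulo"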
- sz_u512_vec_t base_vec, prime_power_vec, shift_vec; - base_vec.zmm = _mm512_set_epi64(31ull, 31ull, 31ull, 31ull, 257ull, 257ull, 257ull, 257ull); - shift_vec.zmm = _mm512_set_epi64(0ull, 0ull, 0ull, 0ull, 77ull, 77ull, 77ull, 77ull); - prime_power_vec.zmm = _mm512_set_epi64(prime_power_low, prime_power_low, prime_power_low, prime_power_low, - prime_power_high, prime_power_high, prime_power_high, prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u512_vec_t hash_vec, chars_vec; - hash_vec.zmm = _mm512_setzero_si512(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`... - chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - - // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); - } - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - sz_u512_vec_t hash_mix_vec; - hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_extracti64x4_epi64(hash_mix_vec.zmm, 0)); - - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. - chars_vec.zmm = _mm512_set_epi64(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length], // - text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - hash_vec.zmm = _mm512_sub_epi64(hash_vec.zmm, _mm512_mullo_epi64(chars_vec.zmm, prime_power_vec.zmm)); - - // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. 
- chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - - // ... and prefetch the next four characters into Level 2 or higher. - _mm_prefetch((sz_cptr_t)text_fourth + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_third + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_second + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_first + 1, _MM_HINT_T1); - - // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_castsi512_si256(hash_mix_vec.zmm)); - - if ((cycle & step_mask) == 0) { - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - } - } -} - -#pragma clang attribute pop -#pragma GCC pop_options - -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512vbmi", "avx512vbmi2", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512vbmi,avx512vbmi2,bmi,bmi2"))), \ - apply_to = function) - -SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - // But if at least 3 cache lines are touched, the AVX-512 implementation should be faster. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } - - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - // We need to pull the lookup table into 4x ZMM registers. - // We can use `vpermi2b` instruction to perform the look in two ZMM registers with `_mm512_permutex2var_epi8` - // intrinsics, but it has a 6-cycle latency on Sapphire Rapids and requires AVX512-VBMI. 
Assuming we need to - // operate on 4 registers, it might be cleaner to use 2x separate `_mm512_permutexvar_epi8` calls. - // Combining the results with 2x `_mm512_test_epi8_mask` and 3x blends afterwards. - // - // - 4x `_mm512_permutexvar_epi8` maps to "VPERMB (ZMM, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p5 - // - On Genoa: 6 cycles latency, ports: 1*FP12 - // - 3x `_mm512_mask_blend_epi8` maps to "VPBLENDMB_Z (ZMM, K, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p05 - // - On Genoa: 1 cycle latency, ports: 1*FP0123 - // - 2x `_mm512_test_epi8_mask` maps to "VPTESTMB (K, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p5 - // - On Genoa: 4 cycles latency, ports: 1*FP01 - // - sz_u512_vec_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec; - lut_0_to_63_vec.zmm = _mm512_loadu_si512((lut)); - lut_64_to_127_vec.zmm = _mm512_loadu_si512((lut + 64)); - lut_128_to_191_vec.zmm = _mm512_loadu_si512((lut + 128)); - lut_192_to_255_vec.zmm = _mm512_loadu_si512((lut + 192)); - - sz_u512_vec_t first_bit_vec, second_bit_vec; - first_bit_vec.zmm = _mm512_set1_epi8((char)0x80); - second_bit_vec.zmm = _mm512_set1_epi8((char)0x40); - - __mmask64 first_bit_mask, second_bit_mask; - sz_u512_vec_t source_vec; - // If the top bit is set in each word of `source_vec`, than we use `lookup_128_to_191_vec` or - // `lookup_192_to_255_vec`. If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`. - sz_u512_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec; - sz_u512_vec_t blended_0_to_127_vec, blended_128_to_255_vec, blended_0_to_255_vec; - - // Handling the head. - if (head_length) { - source_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_mask_storeu_epi8(target, head_mask, blended_0_to_255_vec.zmm); - source += head_length, target += head_length, length -= head_length; - } - - // Handling the body in 64-byte chunks aligned to cache-line boundaries with respect to `target`. 
- while (length >= 64) { - source_vec.zmm = _mm512_loadu_si512(source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_store_si512(target, blended_0_to_255_vec.zmm); //! Aligned store, our main weapon! - source += 64, target += 64, length -= 64; - } - - // Handling the tail. - if (tail_length) { - source_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_mask_storeu_epi8(target, tail_mask, blended_0_to_255_vec.zmm); - source += tail_length, target += tail_length, length -= tail_length; - } -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - - // Before initializing the AVX-512 vectors, we may want to run the sequential code for the first few bytes. - // In practice, that only hurts, even when we have matches every 5-ish bytes. - // - // if (length < SZ_SWAR_THRESHOLD) return sz_find_charset_serial(text, length, filter); - // sz_cptr_t early_result = sz_find_charset_serial(text, SZ_SWAR_THRESHOLD, filter); - // if (early_result) return early_result; - // text += SZ_SWAR_THRESHOLD; - // length -= SZ_SWAR_THRESHOLD; - // - // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. - // That way when we invoke `_mm512_shuffle_epi8` we can use the same mask for both lanes. - sz_u512_vec_t filter_even_vec, filter_odd_vec; - __m256i filter_ymm = _mm256_lddqu_si256((__m256i const *)filter); - // There are a few way to initialize filters without having native strided loads. 
- // In the chronological order of experiments:
- // - serial code initializing 128 bytes of odd and even mask
- // - using several shuffles
- // - using `_mm512_permutexvar_epi8`
- // - using `_mm512_broadcast_i32x4(_mm256_castsi256_si128(_mm256_maskz_compress_epi8(0x55555555, filter_ymm)))`
- // and `_mm512_broadcast_i32x4(_mm256_castsi256_si128(_mm256_maskz_compress_epi8(0xaaaaaaaa, filter_ymm)))`
- filter_even_vec.zmm = _mm512_broadcast_i32x4(_mm256_castsi256_si128( // broadcast __m128i to __m512i
- _mm256_maskz_compress_epi8(0x55555555, filter_ymm)));
- filter_odd_vec.zmm = _mm512_broadcast_i32x4(_mm256_castsi256_si128( // broadcast __m128i to __m512i
- _mm256_maskz_compress_epi8(0xaaaaaaaa, filter_ymm)));
- // After the unzipping operation, we can validate the contents of the vectors like this:
- //
- // for (sz_size_t i = 0; i != 16; ++i) {
- // sz_assert(filter_even_vec.u8s[i] == filter->_u8s[i * 2]);
- // sz_assert(filter_odd_vec.u8s[i] == filter->_u8s[i * 2 + 1]);
- // sz_assert(filter_even_vec.u8s[i + 16] == filter->_u8s[i * 2]);
- // sz_assert(filter_odd_vec.u8s[i + 16] == filter->_u8s[i * 2 + 1]);
- // sz_assert(filter_even_vec.u8s[i + 32] == filter->_u8s[i * 2]);
- // sz_assert(filter_odd_vec.u8s[i + 32] == filter->_u8s[i * 2 + 1]);
- // sz_assert(filter_even_vec.u8s[i + 48] == filter->_u8s[i * 2]);
- // sz_assert(filter_odd_vec.u8s[i + 48] == filter->_u8s[i * 2 + 1]);
- // }
- //
- sz_u512_vec_t text_vec;
- sz_u512_vec_t lower_nibbles_vec, higher_nibbles_vec;
- sz_u512_vec_t bitset_even_vec, bitset_odd_vec;
- sz_u512_vec_t bitmask_vec, bitmask_lookup_vec;
- bitmask_lookup_vec.zmm = _mm512_set_epi8( //
- -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, //
- -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, //
- -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, //
- -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1);
-
- while (length) {
- // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set"
- // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so
- // StringZilla uses a somewhat different approach.
- // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new
- //
- // sz_u8_t input = *(sz_u8_t const *)text;
- // sz_u8_t lo_nibble = input & 0x0f;
- // sz_u8_t hi_nibble = input >> 4;
- // sz_u8_t bitset_even = filter_even_vec.u8s[hi_nibble];
- // sz_u8_t bitset_odd = filter_odd_vec.u8s[hi_nibble];
- // sz_u8_t bitmask = (1 << (lo_nibble & 0x7));
- // sz_u8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd;
- // if ((bitset & bitmask) != 0) return text;
- // else { length--, text++; }
- //
- // The nice part about this is that loading the strided data is very easy with Arm NEON,
- // while with x86 CPUs after AVX, shuffles within 256 bits shouldn't be an issue either.
- sz_size_t load_length = sz_min_of_two(length, 64); - __mmask64 load_mask = _sz_u64_mask_until(load_length); - text_vec.zmm = _mm512_maskz_loadu_epi8(load_mask, text); - lower_nibbles_vec.zmm = _mm512_and_si512(text_vec.zmm, _mm512_set1_epi8(0x0f)); - bitmask_vec.zmm = _mm512_shuffle_epi8(bitmask_lookup_vec.zmm, lower_nibbles_vec.zmm); - // - // At this point we can validate the `bitmask_vec` contents like this: - // - // for (sz_size_t i = 0; i != load_length; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); - // } - // - // Shift right every byte by 4 bits. - // There is no `_mm512_srli_epi8` intrinsic, so we have to use `_mm512_srli_epi16` - // and combine it with a mask to clear the higher bits. - higher_nibbles_vec.zmm = _mm512_and_si512(_mm512_srli_epi16(text_vec.zmm, 4), _mm512_set1_epi8(0x0f)); - bitset_even_vec.zmm = _mm512_shuffle_epi8(filter_even_vec.zmm, higher_nibbles_vec.zmm); - bitset_odd_vec.zmm = _mm512_shuffle_epi8(filter_odd_vec.zmm, higher_nibbles_vec.zmm); - // - // At this point we can validate the `bitset_even_vec` and `bitset_odd_vec` contents like this: - // - // for (sz_size_t i = 0; i != load_length; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t const *bitset_ptr = &filter->_u8s[0]; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; - // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); - // } - // - // TODO: Is this a good place for ternary logic? - __mmask64 take_first = _mm512_cmplt_epi8_mask(lower_nibbles_vec.zmm, _mm512_set1_epi8(8)); - bitset_even_vec.zmm = _mm512_mask_blend_epi8(take_first, bitset_odd_vec.zmm, bitset_even_vec.zmm); - __mmask64 matches_mask = _mm512_mask_test_epi8_mask(load_mask, bitset_even_vec.zmm, bitmask_vec.zmm); - if (matches_mask) { - int offset = sz_u64_ctz(matches_mask); - return text + offset; - } - else { text += load_length, length -= load_length; } - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); -} - -SZ_PUBLIC sz_cptr_t sz_find_many_avx512( // - sz_cptr_t haystack, sz_size_t haystack_length, // - sz_cptr_t const *needles, sz_size_t const *needles_lengths, // - sz_size_t *needle_offset) { - - // When dealing with huge needles vocabularies, like in tokenization workloads, we need to construct an automaton. - // But in many cases, the vocabulary is small enough to use a simpler DFA-less approach, combining the ideas from - // the `sz_find_avx512` and `sz_find_charset_avx512` functions. - // - // Pick the offsets within needles where there is the least variance in the characters. - // Like for "the", "then", "there", "these", "those", "their", "they", "them", "that", "this", "thus", "than": - // - // 0: 't' - // 1: 'h' - // 2: 'e', 'a', 'i', 'o', 'u' - // 3: 'n', 'r', 's', 'i', 'y', 'm', 't' - // - // So depending on our "register budget", we can use a different number of pivot points: offset 0, 1, 2 make - // the most sense if we can only use 3 ZMM registers. - sz_unused(haystack && haystack_length && needles && needles_lengths && needle_offset); - return 0; -} - -/** - * Computes the Needleman Wunsch alignment score between two strings. 
- * The method uses 32-bit integers to accumulate the running score for every cell in the matrix.
- * Assuming the costs of substitutions can be arbitrary signed 8-bit integers, the method is expected to be used
- * on strings not exceeding 2^24 (~16.7 million) characters in length.
- *
- * Unlike the `_sz_edit_distance_skewed_diagonals_upto65k_avx512` method, this one uses signed integers to store
- * the accumulated score. Moreover, its primary bottleneck is the latency of gathering the substitution costs
- * from the substitution matrix. If we use the diagonal order, we will be comparing a slice of the first string with
- * a slice of the second. If we stick to the conventional horizontal order, we will be comparing one character against
- * a slice, which is much easier to optimize. In that case we are sampling costs not from arbitrary parts of
- * a 256 x 256 matrix, but from a single row!
- */
-SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_avx512( //
- sz_cptr_t shorter, sz_size_t shorter_length, //
- sz_cptr_t longer, sz_size_t longer_length, //
- sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) {
-
- // If one of the strings is empty - the score is the length of the other one multiplied by the gap penalty.
- if (longer_length == 0) return (sz_ssize_t)shorter_length * gap;
- if (shorter_length == 0) return (sz_ssize_t)longer_length * gap;
-
- // Let's make sure that we use the amount of memory proportional to the
- // number of elements in the shorter string, not the larger.
- if (shorter_length > longer_length) {
- sz_pointer_swap((void **)&longer_length, (void **)&shorter_length);
- sz_pointer_swap((void **)&longer, (void **)&shorter);
- }
-
- // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
- sz_memory_allocator_t global_alloc;
- if (!alloc) {
- sz_memory_allocator_init_default(&global_alloc);
- alloc = &global_alloc;
- }
-
- sz_size_t const max_length = 256ull * 256ull * 256ull;
- sz_size_t const n = longer_length + 1;
- sz_assert(n < max_length && "The length must fit into 24-bit integer. Otherwise use serial variant.");
- sz_unused(longer_length && max_length);
-
- sz_size_t buffer_length = sizeof(sz_i32_t) * n * 2;
- sz_i32_t *distances = (sz_i32_t *)alloc->allocate(buffer_length, alloc->handle);
- sz_i32_t *previous_distances = distances;
- sz_i32_t *current_distances = previous_distances + n;
-
- // Initialize the first row of the Levenshtein matrix with `iota`.
- for (sz_size_t idx_longer = 0; idx_longer != n; ++idx_longer)
- previous_distances[idx_longer] = (sz_i32_t)idx_longer * gap;
-
- /// Contains up to 64 consecutive characters from the longer string.
- sz_u512_vec_t longer_vec;
- sz_u512_vec_t cost_deletion_vec, cost_substitution_vec, lookup_substitution_vec, current_vec;
- sz_u512_vec_t row_first_subs_vec, row_second_subs_vec, row_third_subs_vec, row_fourth_subs_vec;
- sz_u512_vec_t shuffled_first_subs_vec, shuffled_second_subs_vec, shuffled_third_subs_vec, shuffled_fourth_subs_vec;
-
- // Prepare constants and masks.
- sz_u512_vec_t is_third_or_fourth_vec, is_second_or_fourth_vec, gap_vec; - { - char is_third_or_fourth_check, is_second_or_fourth_check; - *(sz_u8_t *)&is_third_or_fourth_check = 0x80, *(sz_u8_t *)&is_second_or_fourth_check = 0x40; - is_third_or_fourth_vec.zmm = _mm512_set1_epi8(is_third_or_fourth_check); - is_second_or_fourth_vec.zmm = _mm512_set1_epi8(is_second_or_fourth_check); - gap_vec.zmm = _mm512_set1_epi32(gap); - } - - sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter; - for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) { - sz_i32_t last_in_row = current_distances[0] = (sz_i32_t)(idx_shorter + 1) * gap; - - // Load one row of the substitution matrix into four ZMM registers. - sz_error_cost_t const *row_subs = subs + shorter_unsigned[idx_shorter] * 256u; - row_first_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 0); - row_second_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 1); - row_third_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 2); - row_fourth_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 3); - - // In the serial version we have one forward pass, that computes the deletion, - // insertion, and substitution costs at once. - // for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) { - // sz_ssize_t cost_deletion = previous_distances[idx_longer + 1] + gap; - // sz_ssize_t cost_insertion = current_distances[idx_longer] + gap; - // sz_ssize_t cost_substitution = previous_distances[idx_longer] + row_subs[longer_unsigned[idx_longer]]; - // current_distances[idx_longer + 1] = sz_min_of_three(cost_deletion, cost_insertion, cost_substitution); - // } - // - // Given the complexity of handling the data-dependency between consecutive insertion cost computations - // within a Levenshtein matrix, the simplest design would be to vectorize every kind of cost computation - // separately. - // 1. Compute substitution costs for up to 64 characters at once, upcasting from 8-bit integers to 32. - // 2. Compute the pairwise minimum with deletion costs. - // 3. Inclusive prefix minimum computation to combine with addition costs. - // Proceeding with substitutions: - for (sz_size_t idx_longer = 0; idx_longer < longer_length; idx_longer += 64) { - sz_size_t register_length = sz_min_of_two(longer_length - idx_longer, 64); - __mmask64 mask = _sz_u64_mask_until(register_length); - longer_vec.zmm = _mm512_maskz_loadu_epi8(mask, longer + idx_longer); - - // Blend the `row_(first|second|third|fourth)_subs_vec` into `current_vec`, picking the right source - // for every character in `longer_vec`. Before that, we need to permute the subsititution vectors. - // Only the bottom 6 bits of a byte are used in VPERB, so we don't even need to mask. - shuffled_first_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_first_subs_vec.zmm); - shuffled_second_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_second_subs_vec.zmm); - shuffled_third_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_third_subs_vec.zmm); - shuffled_fourth_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_fourth_subs_vec.zmm); - - // To blend we can invoke three `_mm512_cmplt_epu8_mask`, but we can also achieve the same using - // the AND logical operation, checking the top two bits of every byte. - // Continuing this thought, we can use the VPTESTMB instruction to output the mask after the AND. 
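- // In scalar form the blend below collapses to a single table lookup, `row_subs[c]`, where `c` is a
- // character of the longer string. The vectorized path has to split the 256-entry row into four
- // 64-entry quarters, so bits 6 and 7 of `c` select which of the four permuted results is valid.
- // A sketch of that selection logic only:
- //
- //    int quarter = ((c & 0x80) ? 2 : 0) + ((c & 0x40) ? 1 : 0);  // 0 to 3
- //    sz_error_cost_t cost = row_subs[quarter * 64 + (c & 0x3F)]; // same as `row_subs[c]`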
- __mmask64 is_third_or_fourth = _mm512_mask_test_epi8_mask(mask, longer_vec.zmm, is_third_or_fourth_vec.zmm); - __mmask64 is_second_or_fourth = - _mm512_mask_test_epi8_mask(mask, longer_vec.zmm, is_second_or_fourth_vec.zmm); - lookup_substitution_vec.zmm = _mm512_mask_blend_epi8( - is_third_or_fourth, - // Choose between the first and the second. - _mm512_mask_blend_epi8(is_second_or_fourth, shuffled_first_subs_vec.zmm, shuffled_second_subs_vec.zmm), - // Choose between the third and the fourth. - _mm512_mask_blend_epi8(is_second_or_fourth, shuffled_third_subs_vec.zmm, shuffled_fourth_subs_vec.zmm)); - - // First, sign-extend lower and upper 16 bytes to 16-bit integers. - __m512i current_0_31_vec = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(lookup_substitution_vec.zmm, 0)); - __m512i current_32_63_vec = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(lookup_substitution_vec.zmm, 1)); - - // Now extend those 16-bit integers to 32-bit. - // This isn't free, same as the subsequent store, so we only want to do that for the populated lanes. - // To minimize the number of loads and stores, we can combine our substitution costs with the previous - // distances, containing the deletion costs. - { - cost_substitution_vec.zmm = _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer); - cost_substitution_vec.zmm = _mm512_add_epi32( - cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_0_31_vec, 0))); - cost_deletion_vec.zmm = _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer); - cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm); - current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm); - - // Inclusive prefix minimum computation to combine with insertion costs. - // Simply disabling this operation results in 5x performance improvement, meaning - // that this operation is responsible for 80% of the total runtime. - // for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) { - // current_distances[idx_longer + 1] = - // sz_max_of_two(current_distances[idx_longer] + gap, current_distances[idx_longer + 1]); - // } - // - // To perform the same operation in vectorized form, we need to perform a tree-like reduction, - // that will involve multiple steps. It's quite expensive and should be first tested in the - // "experimental" section. - // - // Another approach might be loop unrolling: - // current_vec.i32s[0] = last_in_row = sz_i32_max_of_two(current_vec.i32s[0], last_in_row + gap); - // current_vec.i32s[1] = last_in_row = sz_i32_max_of_two(current_vec.i32s[1], last_in_row + gap); - // current_vec.i32s[2] = last_in_row = sz_i32_max_of_two(current_vec.i32s[2], last_in_row + gap); - // ... yet this approach is also quite expensive. - for (int i = 0; i != 16; ++i) - current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap); - _mm512_mask_storeu_epi32(current_distances + idx_longer + 1, (__mmask16)mask, current_vec.zmm); - } - - // Export the values from 16 to 31. 
- if (register_length > 16) {
- mask = _kshiftri_mask64(mask, 16);
- cost_substitution_vec.zmm =
- _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 16);
- cost_substitution_vec.zmm = _mm512_add_epi32(
- cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_0_31_vec, 1)));
- cost_deletion_vec.zmm =
- _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 16);
- cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm);
- current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm);
-
- // Aggregate running insertion costs within the register.
- for (int i = 0; i != 16; ++i)
- current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
- _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 16, (__mmask16)mask, current_vec.zmm);
- }
-
- // Export the values from 32 to 47.
- if (register_length > 32) {
- mask = _kshiftri_mask64(mask, 16);
- cost_substitution_vec.zmm =
- _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 32);
- cost_substitution_vec.zmm = _mm512_add_epi32(
- cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_32_63_vec, 0)));
- cost_deletion_vec.zmm =
- _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 32);
- cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm);
- current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm);
-
- // Aggregate running insertion costs within the register.
- for (int i = 0; i != 16; ++i)
- current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
- _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 32, (__mmask16)mask, current_vec.zmm);
- }
-
- // Export the values from 48 to 63.
- if (register_length > 48) {
- mask = _kshiftri_mask64(mask, 16);
- cost_substitution_vec.zmm =
- _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 48);
- cost_substitution_vec.zmm = _mm512_add_epi32(
- cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_32_63_vec, 1)));
- cost_deletion_vec.zmm =
- _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 48);
- cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm);
- current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm);
-
- // Aggregate running insertion costs within the register.
- for (int i = 0; i != 16; ++i)
- current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
- _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 48, (__mmask16)mask, current_vec.zmm);
- }
- }
-
- // Swap previous_distances and current_distances pointers
- sz_pointer_swap((void **)&previous_distances, (void **)&current_distances);
- }
-
- // Cache scalar before `free` call.
- sz_ssize_t result = previous_distances[longer_length];
- alloc->free(distances, buffer_length, alloc->handle);
- return result;
-}
-
-SZ_INTERNAL sz_ssize_t sz_alignment_score_avx512( //
- sz_cptr_t shorter, sz_size_t shorter_length, //
- sz_cptr_t longer, sz_size_t longer_length, //
- sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) {
-
- if (sz_max_of_two(shorter_length, longer_length) < (256ull * 256ull * 256ull))
- return _sz_alignment_score_wagner_fisher_upto17m_avx512(shorter, shorter_length, longer, longer_length, subs,
- gap, alloc);
- else
- return sz_alignment_score_serial(shorter, shorter_length, longer, longer_length, subs, gap, alloc);
-}
-
-enum sz_encoding_t {
- sz_encoding_unknown_k = 0,
- sz_encoding_ascii_k = 1,
- sz_encoding_utf8_k = 2,
- sz_encoding_utf16_k = 3,
- sz_encoding_utf32_k = 4,
- sz_jwt_k,
- sz_base64_k,
- // Low priority encodings:
- sz_encoding_utf8bom_k = 5,
- sz_encoding_utf16le_k = 6,
- sz_encoding_utf16be_k = 7,
- sz_encoding_utf32le_k = 8,
- sz_encoding_utf32be_k = 9,
-};
-
-// Character Set Detection is one of the most commonly performed operations in data processing, with
-// [Chardet](https://github.com/chardet/chardet), [Charset Normalizer](https://github.com/jawah/charset_normalizer),
-// and [cChardet](https://github.com/PyYoshi/cChardet) being the most popular options in the Python ecosystem.
-// All of them are notoriously slow.
-//
-// Moreover, as of October 2024, UTF-8 is the dominant character encoding on the web, used by 98.4% of websites.
-// Others have minimal usage, according to [W3Techs](https://w3techs.com/technologies/overview/character_encoding):
-// - ISO-8859-1: 1.2%
-// - Windows-1252: 0.3%
-// - Windows-1251: 0.2%
-// - EUC-JP: 0.1%
-// - Shift JIS: 0.1%
-// - EUC-KR: 0.1%
-// - GB2312: 0.1%
-// - Windows-1250: 0.1%
-// Within programming language implementations and database management systems, 16-bit and 32-bit fixed-width encodings
-// are also very popular and we need a way to efficiently differentiate between the most common UTF flavors, ASCII, and
-// the rest.
-//
-// One good solution is the [simdutf](https://github.com/simdutf/simdutf) library, but it depends on the C++ runtime
-// and focuses more on incremental validation & transcoding, rather than detection.
-//
-// So we need a very fast and efficient way of determining the encoding ourselves.
-SZ_PUBLIC sz_bool_t sz_detect_encoding(sz_cptr_t text, sz_size_t length) {
- // https://github.com/simdutf/simdutf/blob/master/src/icelake/icelake_utf8_validation.inl.cpp
- // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_from_utf8.inl.cpp#L81
- // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L661
- // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L788
-
- // We can implement this operation in a simpler & different way, assuming that most of the time continuous chunks
- // of memory have identical encoding. With Russian and many European languages, we generally deal with 2-byte
- // codepoints with occasional 1-byte punctuation marks. In the case of Chinese, Japanese, and Korean, we deal with
- // 3-byte codepoints. In the case of emojis, we deal with 4-byte codepoints.
- // We can also use the idea that misaligned reads are quite cheap on modern CPUs.
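- // The cheapest of those checks is the pure-ASCII one: a buffer is ASCII if and only if no byte has
- // the top bit set. A scalar sketch of just that sub-problem (the UTF-16/UTF-32 heuristics would need
- // more state than this):
- //
- //    sz_bool_t is_ascii = sz_true_k;
- //    for (sz_size_t i = 0; i != length; ++i)
- //        if (((sz_u8_t)text[i]) & 0x80) { is_ascii = sz_false_k; break; }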
- int can_be_ascii = 1, can_be_utf8 = 1, can_be_utf16 = 1, can_be_utf32 = 1; - sz_unused(can_be_ascii + can_be_utf8 + can_be_utf16 + can_be_utf32); - sz_unused(text && length); - return sz_false_k; -} - -#pragma clang attribute pop -#pragma GCC pop_options -#endif - -#pragma endregion - -/* @brief Implementation of the string search algorithms using the Arm NEON instruction set, available on 64-bit - * Arm processors. Implements: {substring search, character search, character set search} x {forward, reverse}. - */ -#pragma region ARM NEON - -#if SZ_USE_ARM_NEON -#pragma GCC push_options -#pragma GCC target("arch=armv8.2-a+simd") -#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) - -/** - * @brief Helper structure to simplify work with 64-bit words. - */ -typedef union sz_u128_vec_t { - uint8x16_t u8x16; - uint16x8_t u16x8; - uint32x4_t u32x4; - uint64x2_t u64x2; - sz_u64_t u64s[2]; - sz_u32_t u32s[4]; - sz_u16_t u16s[8]; - sz_u8_t u8s[16]; -} sz_u128_vec_t; - -SZ_INTERNAL sz_u64_t _sz_vreinterpretq_u8_u4(uint8x16_t vec) { - // Use `vshrn` to produce a bitmask, similar to `movemask` in SSE. - // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon - return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(vec), 4)), 0) & 0x8888888888888888ull; -} - -SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: - //! https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations - return sz_order_serial(a, a_length, b, b_length); -} - -SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_u128_vec_t a_vec, b_vec; - for (; length >= 16; a += 16, b += 16, length -= 16) { - a_vec.u8x16 = vld1q_u8((sz_u8_t const *)a); - b_vec.u8x16 = vld1q_u8((sz_u8_t const *)b); - uint8x16_t cmp = vceqq_u8(a_vec.u8x16, b_vec.u8x16); - if (vminvq_u8(cmp) != 255) { return sz_false_k; } // Check if all bytes match - } - - // Handle remaining bytes - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; -} - -SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) { - uint64x2_t sum_vec = vdupq_n_u64(0); - - // Process 16 bytes (128 bits) at a time - for (; length >= 16; text += 16, length -= 16) { - uint8x16_t vec = vld1q_u8((sz_u8_t const *)text); // Load 16 bytes - uint16x8_t pairwise_sum1 = vpaddlq_u8(vec); // Pairwise add lower and upper 8 bits - uint32x4_t pairwise_sum2 = vpaddlq_u16(pairwise_sum1); // Pairwise add 16-bit results - uint64x2_t pairwise_sum3 = vpaddlq_u32(pairwise_sum2); // Pairwise add 32-bit results - sum_vec = vaddq_u64(sum_vec, pairwise_sum3); // Accumulate the sum - } - - // Final reduction of `sum_vec` to a single scalar - sz_u64_t sum = vgetq_lane_u64(sum_vec, 0) + vgetq_lane_u64(sum_vec, 1); - if (length) sum += sz_checksum_serial(text, length); - return sum; -} - -SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // In most cases the `source` and the `target` are not aligned, but we should - // at least make sure that writes don't touch many cache lines. - // NEON has an instruction to load and write 64 bytes at once. - // - // sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. 
- // sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - // for (; head_length; target += 1, source += 1, head_length -= 1) *target = *source; - // length -= head_length; - // for (; length >= 64; target += 64, source += 64, length -= 64) - // vst4q_u8((sz_u8_t *)target, vld1q_u8_x4((sz_u8_t const *)source)); - // for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = *source; - // - // Sadly, those instructions end up being 20% slower than the code processing 16 bytes at a time: - for (; length >= 16; target += 16, source += 16, length -= 16) - vst1q_u8((sz_u8_t *)target, vld1q_u8((sz_u8_t const *)source)); - if (length) sz_copy_serial(target, source, length); -} - -SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // When moving small buffers, using a small buffer on stack as a temporary storage is faster. - - if (target < source || target >= source + length) { - // Non-overlapping, proceed forward - sz_copy_neon(target, source, length); - } - else { - // Overlapping, proceed backward - target += length; - source += length; - - sz_u128_vec_t src_vec; - while (length >= 16) { - target -= 16, source -= 16, length -= 16; - src_vec.u8x16 = vld1q_u8((sz_u8_t const *)source); - vst1q_u8((sz_u8_t *)target, src_vec.u8x16); - } - while (length) { - target -= 1, source -= 1, length -= 1; - *target = *source; - } - } -} - -SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - uint8x16_t fill_vec = vdupq_n_u8(value); // Broadcast the value across the register - - while (length >= 16) { - vst1q_u8((sz_u8_t *)target, fill_vec); - target += 16; - length -= 16; - } - - // Handle remaining bytes - if (length) sz_fill_serial(target, length, value); -} - -SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } - - sz_size_t head_length = (16 - ((sz_size_t)target % 16)) % 16; // 15 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 16; // 15 or less. - - // We need to pull the lookup table into 16x NEON registers. We have a total of 32 such registers. - // According to the Neoverse V2 manual, the 4-table lookup has a latency of 6 cycles, and 4x throughput. - uint8x16x4_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec; - lut_0_to_63_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 0)); - lut_64_to_127_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 64)); - lut_128_to_191_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 128)); - lut_192_to_255_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 192)); - - sz_u128_vec_t source_vec; - // If the top bit is set in each word of `source_vec`, than we use `lookup_128_to_191_vec` or - // `lookup_192_to_255_vec`. If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`. 
- sz_u128_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec; - sz_u128_vec_t blended_0_to_255_vec; - - // Process the head with serial code - for (; head_length; target += 1, source += 1, head_length -= 1) *target = lut[*(sz_u8_t const *)source]; - - // Table lookups on Arm are much simpler to use than on x86, as we can use the `vqtbl4q_u8` instruction - // to perform a 4-table lookup in a single instruction. The XORs are used to adjust the lookup position - // within each 64-byte range of the table. - // Details on the 4-table lookup: https://lemire.me/blog/2019/07/23/arbitrary-byte-to-byte-maps-using-arm-neon/ - length -= head_length; - length -= tail_length; - for (; length >= 16; source += 16, target += 16, length -= 16) { - source_vec.u8x16 = vld1q_u8((sz_u8_t const *)source); - lookup_0_to_63_vec.u8x16 = vqtbl4q_u8(lut_0_to_63_vec, source_vec.u8x16); - lookup_64_to_127_vec.u8x16 = vqtbl4q_u8(lut_64_to_127_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x40))); - lookup_128_to_191_vec.u8x16 = vqtbl4q_u8(lut_128_to_191_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x80))); - lookup_192_to_255_vec.u8x16 = vqtbl4q_u8(lut_192_to_255_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0xc0))); - blended_0_to_255_vec.u8x16 = vorrq_u8(vorrq_u8(lookup_0_to_63_vec.u8x16, lookup_64_to_127_vec.u8x16), - vorrq_u8(lookup_128_to_191_vec.u8x16, lookup_192_to_255_vec.u8x16)); - vst1q_u8((sz_u8_t *)target, blended_0_to_255_vec.u8x16); - } - - // Process the tail with serial code - for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = lut[*(sz_u8_t const *)source]; -} - -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - sz_u64_t matches; - sz_u128_vec_t h_vec, n_vec, matches_vec; - n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n); - - while (h_length >= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h); - matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16); - // In Arm NEON we don't have a `movemask` to combine it with `ctz` and get the offset of the match. - // But assuming the `vmaxvq` is cheap, we can use it to find the first match, by blending (bitwise selecting) - // the vector with a relative offsets array. - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - - h += 16, h_length -= 16; - } - - return sz_find_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - sz_u64_t matches; - sz_u128_vec_t h_vec, n_vec, matches_vec; - n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n); - - while (h_length >= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h + h_length - 16); - matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; - h_length -= 16; - } - - return sz_rfind_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_u64_t _sz_find_charset_neon_register(sz_u128_vec_t h_vec, uint8x16_t set_top_vec_u8x16, - uint8x16_t set_bottom_vec_u8x16) { - - // Once we've read the characters in the haystack, we want to - // compare them against our bitset. The serial version of that code - // would look like: `(set_->_u8s[c >> 3] & (1u << (c & 7u))) != 0`. 
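- // Expanded into scalar form, the mask this helper returns has the following meaning (a sketch of the
- // semantics only, reusing the `set_->_u8s` notation from the comment above):
- //
- //    sz_u64_t result = 0;
- //    for (sz_size_t i = 0; i != 16; ++i) {
- //        sz_u8_t c = h_vec.u8s[i];
- //        int is_in_set = (set_->_u8s[c >> 3] & (1u << (c & 7u))) != 0;
- //        result |= (sz_u64_t)is_in_set << (i * 4 + 3); // 4 bits per character, bit 3 of each nibble
- //    }
- //    return result;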
- uint8x16_t byte_index_vec = vshrq_n_u8(h_vec.u8x16, 3); - uint8x16_t byte_mask_vec = vshlq_u8(vdupq_n_u8(1), vreinterpretq_s8_u8(vandq_u8(h_vec.u8x16, vdupq_n_u8(7)))); - uint8x16_t matches_top_vec = vqtbl1q_u8(set_top_vec_u8x16, byte_index_vec); - // The table lookup instruction in NEON replies to out-of-bound requests with zeros. - // The values in `byte_index_vec` all fall in [0; 32). So for values under 16, substracting 16 will underflow - // and map into interval [240, 256). Meaning that those will be populated with zeros and we can safely - // merge `matches_top_vec` and `matches_bottom_vec` with a bitwise OR. - uint8x16_t matches_bottom_vec = vqtbl1q_u8(set_bottom_vec_u8x16, vsubq_u8(byte_index_vec, vdupq_n_u8(16))); - uint8x16_t matches_vec = vorrq_u8(matches_top_vec, matches_bottom_vec); - // Istead of pure `vandq_u8`, we can immediately broadcast a match presence across each 8-bit word. - matches_vec = vtstq_u8(matches_vec, byte_mask_vec); - return _sz_vreinterpretq_u8_u4(matches_vec); -} - -SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_neon(h, h_length, n); - - // Scan through the string. - // Assuming how tiny the Arm NEON registers are, we should avoid internal branches at all costs. - // That's why, for smaller needles, we use different loops. - if (n_length == 2) { - // Broadcast needle characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_last_vec, n_first_vec, n_last_vec, matches_vec; - // Dealing with 16-bit values, we can load 2 registers at a time and compare 31 possible offsets - // in a single loop iteration. - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]); - for (; h_length >= 17; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1)); - matches_vec.u8x16 = - vandq_u8(vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - } - else if (n_length == 3) { - // Broadcast needle characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - // Comparing 24-bit values is a bumer. Being lazy, I went with the same approach - // as when searching for string over 4 characters long. I only avoid the last comparison. - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[2]); - for (; h_length >= 18; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 2)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - } - else { - // Pick the parts of the needle that are worth comparing. 
- sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - // Broadcast those characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]); - // Walk through the string. - for (; h_length >= n_length + 16; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_first)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_mid)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_last)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - while (matches) { - int potential_offset = sz_u64_ctz(matches) / 4; - if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - } - - return sz_find_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_neon(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Will contain 4 bits per character. 
- sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]); - - sz_cptr_t h_reversed; - for (; h_length >= n_length + 16; h_length -= 16) { - h_reversed = h + h_length - n_length - 16 + 1; - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_first)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_mid)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_last)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - while (matches) { - int potential_offset = sz_u64_clz(matches) / 4; - if (sz_equal(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & (1ull << (63 - potential_offset * 4))) != 0 && - "The bit must be set before we squash it"); - matches &= ~(1ull << (63 - potential_offset * 4)); - } - } - - return sz_rfind_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { - sz_u64_t matches; - sz_u128_vec_t h_vec; - uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); - uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); - - for (; h_length >= 16; h += 16, h_length -= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h)); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - - return sz_find_charset_serial(h, h_length, set); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { - sz_u64_t matches; - sz_u128_vec_t h_vec; - uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); - uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); - - // Check `sz_find_charset_neon` for explanations. - for (; h_length >= 16; h_length -= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h) + h_length - 16); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); - if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; - } - - return sz_rfind_charset_serial(h, h_length, set); -} - -#pragma clang attribute pop -#pragma GCC pop_options -#endif // Arm Neon - -#pragma endregion - -/* @brief Implementation of the string search algorithms using the Arm SVE variable-length registers, available - * in Arm v9 processors. - * - * Implements: - * - memory: {copy, move, fill} - * - comparisons: {equal, order} - * - search: {substring, character, character set} x {forward, reverse}. 
- */ -#pragma region ARM SVE - -#if SZ_USE_ARM_SVE -#pragma GCC push_options -#pragma GCC target("arch=armv8.2-a+sve") -#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) - -SZ_PUBLIC void sz_fill_sve(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - svuint8_t value_vec = svdup_u8(value); - sz_size_t vec_len = svcntb(); // Vector length in bytes (scalable) - - if (length <= vec_len) { - // Small buffer case: use mask to handle small writes - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)length); - svst1_u8(mask, (unsigned char *)target, value_vec); - } - else { - // Calculate head, body, and tail sizes - sz_size_t head_length = vec_len - ((sz_size_t)target % vec_len); - sz_size_t tail_length = (sz_size_t)(target + length) % vec_len; - sz_size_t body_length = length - head_length - tail_length; - - // Handle unaligned head - svbool_t head_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)head_length); - svst1_u8(head_mask, (unsigned char *)target, value_vec); - target += head_length; - - // Aligned body loop - for (; body_length >= vec_len; target += vec_len, body_length -= vec_len) { - svst1_u8(svptrue_b8(), (unsigned char *)target, value_vec); - } - - // Handle unaligned tail - svbool_t tail_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)tail_length); - svst1_u8(tail_mask, (unsigned char *)target, value_vec); - } -} + * x & ~((x < y) - 1) + y & ((x < y) - 1) // 6 unique operations + */ +#define sz_min_of_two(x, y) (x < y ? x : y) +#define sz_max_of_two(x, y) (x < y ? y : x) +#define sz_min_of_three(x, y, z) sz_min_of_two(x, sz_min_of_two(y, z)) +#define sz_max_of_three(x, y, z) sz_max_of_two(x, sz_max_of_two(y, z)) -SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - sz_size_t vec_len = svcntb(); // Vector length in bytes +/** + * One option to avoid branching is to use conditional moves and lookup the comparison result in a table: + * sz_ordering_t ordering_lookup[2] = {sz_greater_k, sz_less_k}; + * for (; a != min_end; ++a, ++b) + * if (*a != *b) return ordering_lookup[*a < *b]; + * That, however, introduces a data-dependency. + * A cleaner option is to perform two comparisons and a subtraction. + * One instruction more, but no data-dependency. + */ +#define _sz_order_scalars(a, b) ((sz_ordering_t)((a > b) - (a < b))) - // Arm Neoverse V2 cores in Graviton 4, for example, come with 256 KB of L1 data cache per core, - // and 8 MB of L2 cache per core. Moreover, the L1 cache is fully associative. - // With two strings, we may consider the overal workload huge, if each exceeds 1 MB in length. - // - // int is_huge = length >= 4ull * 1024ull * 1024ull; - // - // When the buffer is small, there isn't much to innovate. - if (length <= vec_len) { - // Small buffer case: use mask to handle small writes - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - } - // When dealing with larger buffers, similar to AVX-512, we want minimize unaligned operations - // and handle the head, body, and tail separately. We can also traverse the buffer in both directions - // as Arm generally supports more simultaneous stores than x86 CPUs. - // - // For gigantic datasets, similar to AVX-512, non-temporal "loads" and "stores" can be used. 
-    // Sadly, if the register size (16 byte or larger) is smaller than a cache-line (64 bytes)
-    // we will pay a huge penalty on loads, fetching the same content many times.
-    // It may be better to allow caching (and subsequent eviction), in favor of using four-element
-    // tuples, which will be guaranteed to be a multiple of a cache line.
-    //
-    // Another approach is to use the `LD4B` instructions, which will populate four registers at once.
-    // This, however, further decreases the performance from LibC-like 29 GB/s to 20 GB/s.
-    else {
-        // Calculating head, body, and tail sizes depends on the `vec_len`,
-        // but it's a runtime constant, and the modulo operation is expensive!
-        // Instead we use the fact that it's always a multiple of 128 bits or 16 bytes.
-        sz_size_t head_length = 16 - ((sz_size_t)target % 16);
-        sz_size_t tail_length = (sz_size_t)(target + length) % 16;
-        sz_size_t body_length = length - head_length - tail_length;

+/** @brief Branchless minimum function for two signed 32-bit integers. */
+SZ_INTERNAL sz_i32_t sz_i32_min_of_two(sz_i32_t x, sz_i32_t y) { return y + ((x - y) & (x - y) >> 31); }

-        // Handle unaligned parts
-        svbool_t head_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)head_length);
-        svuint8_t head_data = svld1_u8(head_mask, (unsigned char *)source);
-        svst1_u8(head_mask, (unsigned char *)target, head_data);
-        svbool_t tail_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)tail_length);
-        svuint8_t tail_data = svld1_u8(tail_mask, (unsigned char *)source + head_length + body_length);
-        svst1_u8(tail_mask, (unsigned char *)target + head_length + body_length, tail_data);
-        target += head_length;
-        source += head_length;

+/** @brief Branchless maximum function for two signed 32-bit integers. */
+SZ_INTERNAL sz_i32_t sz_i32_max_of_two(sz_i32_t x, sz_i32_t y) { return x - ((x - y) & (x - y) >> 31); }

-        // Aligned body loop, walking in two directions
-        for (; body_length >= vec_len * 2; target += vec_len, source += vec_len, body_length -= vec_len * 2) {
-            svuint8_t forward_data = svld1_u8(svptrue_b8(), (unsigned char *)source);
-            svuint8_t backward_data = svld1_u8(svptrue_b8(), (unsigned char *)source + body_length - vec_len);
-            svst1_u8(svptrue_b8(), (unsigned char *)target, forward_data);
-            svst1_u8(svptrue_b8(), (unsigned char *)target + body_length - vec_len, backward_data);
-        }
-        // Up to (vec_len * 2 - 1) bytes of data may be left in the body,
-        // so we can unroll the last two optional loop iterations.
-        if (body_length > vec_len) {
-            svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)body_length);
-            svuint8_t data = svld1_u8(mask, (unsigned char *)source);
-            svst1_u8(mask, (unsigned char *)target, data);
-            body_length -= vec_len;
-            source += body_length;
-            target += body_length;
-        }
-        if (body_length) {
-            svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)body_length);
-            svuint8_t data = svld1_u8(mask, (unsigned char *)source);
-            svst1_u8(mask, (unsigned char *)target, data);
-        }
-    }

+/**
+ * @brief Byte-level equality comparison between two 64-bit integers.
+ * @return 64-bit integer, where every top bit in each byte signifies a match.
+ */
+SZ_INTERNAL sz_u64_vec_t _sz_u64_each_byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) {
+    sz_u64_vec_t vec;
+    vec.u64 = ~(a.u64 ^ b.u64);
+    // The match is valid if every bit within each byte is set.
+    // For that take the bottom 7 bits of each byte, add one to them,
+    // and if this sets the top bit to one, then all the 7 bits are ones as well.
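+    // As a worked example for a single byte: if the bytes match, the negated XOR is 0xFF,
+    // so (0xFF & 0x7F) + 0x01 == 0x80, and ANDing with the kept top bit (0xFF & 0x80) leaves 0x80 set - a match.
+    // If only the lowest bit differs, the byte is 0xFE: (0x7E + 0x01) == 0x7F never reaches the top bit - no match.
+    // The masked addends never exceed 0x7F, so the +1 can not carry into the neighboring byte.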
+ vec.u64 = ((vec.u64 & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) & ((vec.u64 & 0x8080808080808080ull)); + return vec; } -#pragma clang attribute pop -#pragma GCC pop_options -#endif // Arm SVE - -#pragma endregion - -/* - * @brief Pick the right implementation for the string search algorithms. +/** + * @brief Clamps signed offsets in a string to a valid range. Used for Pythonic-style slicing. */ -#pragma region Compile Time Dispatching - -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t ins, sz_size_t length) { return sz_hash_serial(ins, length); } -SZ_PUBLIC void sz_tolower(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_tolower_serial(ins, length, outs); } -SZ_PUBLIC void sz_toupper(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toupper_serial(ins, length, outs); } -SZ_PUBLIC void sz_toascii(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toascii_serial(ins, length, outs); } -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t ins, sz_size_t length) { return sz_isascii_serial(ins, length); } - -SZ_PUBLIC void sz_hashes_fingerprint(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_ptr_t fingerprint, - sz_size_t fingerprint_bytes) { +SZ_INTERNAL void sz_ssize_clamp_interval(sz_size_t length, sz_ssize_t start, sz_ssize_t end, + sz_size_t *normalized_offset, sz_size_t *normalized_length) { + // TODO: Remove branches. + // Normalize negative indices + if (start < 0) start += length; + if (end < 0) end += length; - sz_bool_t fingerprint_length_is_power_of_two = (sz_bool_t)((fingerprint_bytes & (fingerprint_bytes - 1)) == 0); - sz_string_view_t fingerprint_buffer = {fingerprint, fingerprint_bytes}; + // Clamp indices to a valid range + if (start < 0) start = 0; + if (end < 0) end = 0; + if (start > (sz_ssize_t)length) start = length; + if (end > (sz_ssize_t)length) end = length; - // There are several issues related to the fingerprinting algorithm. - // First, the memory traversal order is important. - // https://blog.stuffedcow.net/2015/08/pagewalk-coherence/ + // Ensure start <= end + if (start > end) start = end; - // In most cases the fingerprint length will be a power of two. - if (fingerprint_length_is_power_of_two == sz_false_k) - sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_non_pow2_callback, &fingerprint_buffer); - else - sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_pow2_callback, &fingerprint_buffer); + *normalized_offset = start; + *normalized_length = end - start; } -#if !SZ_DYNAMIC_DISPATCH - -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { -#if SZ_USE_X86_AVX512 - return sz_checksum_avx512(text, length); -#elif SZ_USE_X86_AVX2 - return sz_checksum_avx2(text, length); -#elif SZ_USE_ARM_NEON - return sz_checksum_neon(text, length); -#else - return sz_checksum_serial(text, length); -#endif +/** + * @brief Compute the logarithm base 2 of a positive integer, rounding down. + */ +SZ_INTERNAL sz_size_t sz_size_log2i_nonzero(sz_size_t x) { + sz_assert(x > 0 && "Non-positive numbers have no defined logarithm"); + sz_size_t leading_zeros = sz_u64_clz(x); + return 63 - leading_zeros; } -SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { -#if SZ_USE_X86_AVX512 - return sz_equal_avx512(a, b, length); -#elif SZ_USE_X86_AVX2 - return sz_equal_avx2(a, b, length); -#elif SZ_USE_ARM_NEON - return sz_equal_neon(a, b, length); -#else - return sz_equal_serial(a, b, length); +/** + * @brief Compute the smallest power of two greater than or equal to ::x. 
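+ * For example, 5 rounds up to 8, 8 stays 8, and 9 rounds up to 16.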
+ */ +SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) { + // Unlike the commonly used trick with `clz` intrinsics, is valid across the whole range of `x`. + // https://stackoverflow.com/a/10143264 + x--; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; +#if _SZ_IS_64_BIT + x |= x >> 32; #endif + x++; + return x; } -SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { -#if SZ_USE_X86_AVX512 - return sz_order_avx512(a, a_length, b, b_length); -#elif SZ_USE_X86_AVX2 - return sz_order_avx2(a, a_length, b, b_length); -#elif SZ_USE_ARM_NEON - return sz_order_neon(a, a_length, b, b_length); -#else - return sz_order_serial(a, a_length, b, b_length); -#endif +/** + * @brief Transposes an 8x8 bit matrix packed in a `sz_u64_t`. + * + * There is a well known SWAR sequence for that known to chess programmers, + * willing to flip a bit-matrix of pieces along the main A1-H8 diagonal. + * https://www.chessprogramming.org/Flipping_Mirroring_and_Rotating + * https://lukas-prokop.at/articles/2021-07-23-transpose + */ +SZ_INTERNAL sz_u64_t sz_u64_transpose(sz_u64_t x) { + sz_u64_t t; + t = x ^ (x << 36); + x ^= 0xf0f0f0f00f0f0f0full & (t ^ (x >> 36)); + t = 0xcccc0000cccc0000ull & (x ^ (x << 18)); + x ^= t ^ (t >> 18); + t = 0xaa00aa00aa00aa00ull & (x ^ (x << 9)); + x ^= t ^ (t >> 9); + return x; } -SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_X86_AVX512 - sz_copy_avx512(target, source, length); -#elif SZ_USE_X86_AVX2 - sz_copy_avx2(target, source, length); -#elif SZ_USE_ARM_NEON - sz_copy_neon(target, source, length); -#else - sz_copy_serial(target, source, length); -#endif +/** + * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. + */ +SZ_INTERNAL void sz_u64_swap(sz_u64_t *a, sz_u64_t *b) { + sz_u64_t t = *a; + *a = *b; + *b = t; } -SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_X86_AVX512 - sz_move_avx512(target, source, length); -#elif SZ_USE_X86_AVX2 - sz_move_avx2(target, source, length); -#elif SZ_USE_ARM_NEON - sz_move_neon(target, source, length); -#else - sz_move_serial(target, source, length); -#endif +/** + * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. + */ +SZ_INTERNAL void sz_pointer_swap(void **a, void **b) { + void *t = *a; + *a = *b; + *b = t; } -SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) { -#if SZ_USE_X86_AVX512 - sz_fill_avx512(target, length, value); -#elif SZ_USE_X86_AVX2 - sz_fill_avx2(target, length, value); -#elif SZ_USE_ARM_NEON - sz_fill_neon(target, length, value); +/** + * @brief Load a 16-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. + */ +SZ_INTERNAL sz_u16_vec_t sz_u16_load(sz_cptr_t ptr) { +#if !SZ_USE_MISALIGNED_LOADS + sz_u16_vec_t result; + result.u8s[0] = ptr[0]; + result.u8s[1] = ptr[1]; + return result; +#elif defined(_MSC_VER) && !defined(__clang__) +#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. 
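+    // A plain dereference is enough here, as 32-bit x86 tolerates misaligned scalar loads in hardware.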
+ return *((sz_u16_vec_t *)ptr); #else - sz_fill_serial(target, length, value); + return *((__unaligned sz_u16_vec_t *)ptr); #endif -} - -SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { -#if SZ_USE_X86_AVX512 - sz_look_up_transform_avx512(source, length, lut, target); -#elif SZ_USE_X86_AVX2 - sz_look_up_transform_avx2(source, length, lut, target); -#elif SZ_USE_ARM_NEON - sz_look_up_transform_neon(source, length, lut, target); #else - sz_look_up_transform_serial(source, length, lut, target); + __attribute__((aligned(1))) sz_u16_vec_t const *result = (sz_u16_vec_t const *)ptr; + return *result; #endif } -SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_X86_AVX512 - return sz_find_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_X86_AVX2 - return sz_find_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_ARM_NEON - return sz_find_byte_neon(haystack, h_length, needle); +/** + * @brief Load a 32-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. + */ +SZ_INTERNAL sz_u32_vec_t sz_u32_load(sz_cptr_t ptr) { +#if !SZ_USE_MISALIGNED_LOADS + sz_u32_vec_t result; + result.u8s[0] = ptr[0]; + result.u8s[1] = ptr[1]; + result.u8s[2] = ptr[2]; + result.u8s[3] = ptr[3]; + return result; +#elif defined(_MSC_VER) && !defined(__clang__) +#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. + return *((sz_u32_vec_t *)ptr); #else - return sz_find_byte_serial(haystack, h_length, needle); + return *((__unaligned sz_u32_vec_t *)ptr); #endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_X86_AVX512 - return sz_rfind_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_X86_AVX2 - return sz_rfind_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_ARM_NEON - return sz_rfind_byte_neon(haystack, h_length, needle); #else - return sz_rfind_byte_serial(haystack, h_length, needle); + __attribute__((aligned(1))) sz_u32_vec_t const *result = (sz_u32_vec_t const *)ptr; + return *result; #endif } -SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_X86_AVX512 - return sz_find_avx512(haystack, h_length, needle, n_length); -#elif SZ_USE_X86_AVX2 - return sz_find_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_ARM_NEON - return sz_find_neon(haystack, h_length, needle, n_length); +/** + * @brief Load a 64-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. + */ +SZ_INTERNAL sz_u64_vec_t sz_u64_load(sz_cptr_t ptr) { +#if !SZ_USE_MISALIGNED_LOADS + sz_u64_vec_t result; + result.u8s[0] = ptr[0]; + result.u8s[1] = ptr[1]; + result.u8s[2] = ptr[2]; + result.u8s[3] = ptr[3]; + result.u8s[4] = ptr[4]; + result.u8s[5] = ptr[5]; + result.u8s[6] = ptr[6]; + result.u8s[7] = ptr[7]; + return result; +#elif defined(_MSC_VER) && !defined(__clang__) +#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. 
+ return *((sz_u64_vec_t *)ptr); #else - return sz_find_serial(haystack, h_length, needle, n_length); + return *((__unaligned sz_u64_vec_t *)ptr); #endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_X86_AVX512 - return sz_rfind_avx512(haystack, h_length, needle, n_length); -#elif SZ_USE_X86_AVX2 - return sz_rfind_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_ARM_NEON - return sz_rfind_neon(haystack, h_length, needle, n_length); #else - return sz_rfind_serial(haystack, h_length, needle, n_length); + __attribute__((aligned(1))) sz_u64_vec_t const *result = (sz_u64_vec_t const *)ptr; + return *result; #endif } -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_X86_AVX512 - return sz_find_charset_avx512(text, length, set); -#elif SZ_USE_X86_AVX2 - return sz_find_charset_avx2(text, length, set); -#elif SZ_USE_ARM_NEON - return sz_find_charset_neon(text, length, set); -#else - return sz_find_charset_serial(text, length, set); -#endif +/** @brief Helper function, using the supplied fixed-capacity buffer to allocate memory. */ +SZ_INTERNAL sz_ptr_t _sz_memory_allocate_fixed(sz_size_t length, void *handle) { + sz_size_t capacity; + sz_copy((sz_ptr_t)&capacity, (sz_cptr_t)handle, sizeof(sz_size_t)); + sz_size_t consumed_capacity = sizeof(sz_size_t); + if (consumed_capacity + length > capacity) return SZ_NULL_CHAR; + return (sz_ptr_t)handle + consumed_capacity; } -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_X86_AVX512 - return sz_rfind_charset_avx512(text, length, set); -#elif SZ_USE_X86_AVX2 - return sz_rfind_charset_avx2(text, length, set); -#elif SZ_USE_ARM_NEON - return sz_rfind_charset_neon(text, length, set); -#else - return sz_rfind_charset_serial(text, length, set); -#endif +/** @brief Helper "no-op" function, simulating memory deallocation when we use a "static" memory buffer. */ +SZ_INTERNAL void _sz_memory_free_fixed(sz_ptr_t start, sz_size_t length, void *handle) { + sz_unused(start && length && handle); } -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_serial(a, a_length, b, b_length, bound); +/** @brief An internal callback used to set a bit in a power-of-two length binary fingerprint of a string. */ +SZ_INTERNAL void _sz_hashes_fingerprint_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) { + sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; + sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; + sz_size_t fingerprint_bytes = fingerprint_buffer->length; + fingerprint_u8s[(hash / 8) & (fingerprint_bytes - 1)] |= (1 << (hash & 7)); + sz_unused(start && length); } -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_utf8_serial(a, a_length, b, b_length, bound); +/** @brief An internal callback used to set a bit in a @b non power-of-two length binary fingerprint of a string. 
*/ +SZ_INTERNAL void _sz_hashes_fingerprint_non_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, + void *handle) { + sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; + sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; + sz_size_t fingerprint_bytes = fingerprint_buffer->length; + fingerprint_u8s[(hash / 8) % fingerprint_bytes] |= (1 << (hash & 7)); + sz_unused(start && length); } -SZ_DYNAMIC sz_size_t sz_edit_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { -#if SZ_USE_X86_AVX512 - return sz_edit_distance_avx512(a, a_length, b, b_length, bound, alloc); -#else - return sz_edit_distance_serial(a, a_length, b, b_length, bound, alloc); -#endif +/** @brief An internal callback, used to mix all the running hashes into one pointer-size value. */ +SZ_INTERNAL void _sz_hashes_fingerprint_scalar_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, + void *scalar_handle) { + sz_unused(start && length && hash && scalar_handle); + sz_size_t *scalar_ptr = (sz_size_t *)scalar_handle; + *scalar_ptr ^= hash; } -SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - return _sz_edit_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc); -} +#pragma GCC visibility pop +#pragma endregion -SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_error_cost_t const *subs, sz_error_cost_t gap, - sz_memory_allocator_t *alloc) { -#if SZ_USE_X86_AVX512 - return sz_alignment_score_avx512(a, a_length, b, b_length, subs, gap, alloc); -#else - return sz_alignment_score_serial(a, a_length, b, b_length, subs, gap, alloc); -#endif -} +#pragma region Serial Implementation -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle) { -#if SZ_USE_X86_AVX512 - sz_hashes_avx512(text, length, window_length, window_step, callback, callback_handle); -#elif SZ_USE_X86_AVX2 - sz_hashes_avx2(text, length, window_length, window_step, callback, callback_handle); -#else - sz_hashes_serial(text, length, window_length, window_step, callback, callback_handle); -#endif -} +#if !SZ_AVOID_LIBC +#include // `fprintf` +#include // `malloc`, `EXIT_FAILURE` -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_find_charset(h, h_length, &set); +SZ_PUBLIC void *_sz_memory_allocate_default(sz_size_t length, void *handle) { + sz_unused(handle); + return malloc(length); } - -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_find_charset(h, h_length, &set); +SZ_PUBLIC void _sz_memory_free_default(sz_ptr_t start, sz_size_t length, void *handle) { + sz_unused(handle && length); + free(start); } -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_rfind_charset(h, 
h_length, &set); -} +#endif -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_rfind_charset(h, h_length, &set); +SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc) { +#if !SZ_AVOID_LIBC + alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_default; + alloc->free = (sz_memory_free_t)_sz_memory_free_default; +#else + alloc->allocate = (sz_memory_allocate_t)SZ_NULL; + alloc->free = (sz_memory_free_t)SZ_NULL; +#endif + alloc->handle = SZ_NULL; } -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { - sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data); +SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length) { + // The logic here is simple - put the buffer length in the first slots of the buffer. + // Later use it for bounds checking. + alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_fixed; + alloc->free = (sz_memory_free_t)_sz_memory_free_fixed; + alloc->handle = &buffer; + sz_copy((sz_ptr_t)buffer, (sz_cptr_t)&length, sizeof(sz_size_t)); } -#endif #pragma endregion #ifdef __cplusplus @@ -7153,4 +1108,4 @@ SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_ } #endif // __cplusplus -#endif // STRINGZILLA_H_ +#endif // STRINGZILLA_TYPES_H_ From 5f7ca590428e13f6a92e2341b64506f535266ba1 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 19:24:05 +0000 Subject: [PATCH 041/751] Fix: Minor macro mismatches --- .vscode/settings.json | 6 ++++ include/stringzilla/drafts.h | 4 +-- include/stringzilla/hash.h | 18 ++++++---- include/stringzilla/memory.h | 11 +----- include/stringzilla/similarity.h | 22 +++++++----- include/stringzilla/types.h | 38 ++++++++++++-------- scripts/bench_memory.cpp | 30 ++++++++-------- scripts/bench_search.cpp | 60 +++++++++++++++++--------------- scripts/bench_similarity.cpp | 2 +- scripts/bench_token.cpp | 27 +++++++------- scripts/test.cpp | 16 ++++----- 11 files changed, 125 insertions(+), 109 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 980956d1..ee1f1d3b 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -27,6 +27,7 @@ "Baeza", "basicsize", "bigram", + "bigrams", "bioinformaticians", "bioinformatics", "Bitap", @@ -50,6 +51,7 @@ "getslice", "Giancarlo", "Gonnet", + "Haswell", "Heikki", "hexdigits", "Hirschberg's", @@ -102,6 +104,7 @@ "readlines", "releasebuffer", "rfind", + "rfinds", "richcompare", "Ritchie", "rmatcher", @@ -111,11 +114,13 @@ "rsplits", "rstrip", "SIMD", + "Skylake", "splitlines", "ssize", "startswith", "STL", "stringzilla", + "stringzillite", "Strs", "strzl", "substr", @@ -129,6 +134,7 @@ "unpoison", "usecases", "Vardanian", + "VBMI", "vectorcallfunc", "Wagner", "whitespaces", diff --git a/include/stringzilla/drafts.h b/include/stringzilla/drafts.h index bcba2233..1817a81e 100644 --- a/include/stringzilla/drafts.h +++ b/include/stringzilla/drafts.h @@ -476,7 +476,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz #endif // SZ_USE_AVX512 -#if SZ_USE_ARM_NEON +#if SZ_USE_NEON SZ_PUBLIC sz_cptr_t 
sz_find_neon_too_smart(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { @@ -946,7 +946,7 @@ SZ_PUBLIC void sz_hashes_neon_readahead(sz_cptr_t start, sz_size_t length, sz_si } } -#endif // SZ_USE_ARM_NEON +#endif // SZ_USE_NEON #ifdef __cplusplus } // extern "C" diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index bf24a5e6..d8f4a05e 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -74,8 +74,9 @@ SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length); * @param callback_handle Optional user-provided pointer to be passed to the `callback`. * @see sz_hashes_fingerprint, sz_hashes_intersection */ -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); +SZ_DYNAMIC void sz_hashes( // + sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // + sz_hash_callback_t callback, void *callback_handle); /** * @brief Computes the Karp-Rabin rolling hashes of a string outputting a binary fingerprint. @@ -140,14 +141,19 @@ SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t /** @copydoc sz_checksum */ SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length); + /** @copydoc sz_hash */ SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length); + /** @copydoc sz_generate */ -SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); +SZ_PUBLIC void sz_generate_serial( // + sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, sz_random_generator_t generate, + void *generator); + /** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); +SZ_PUBLIC void sz_hashes_serial( // + sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // + sz_hash_callback_t callback, void *callback_handle); #pragma endregion // Core API diff --git a/include/stringzilla/memory.h b/include/stringzilla/memory.h index 87957878..32106a82 100644 --- a/include/stringzilla/memory.h +++ b/include/stringzilla/memory.h @@ -9,7 +9,7 @@ * - `sz_move` - analog to `memmove` * - `sz_fill` - analog to `memset` * - `sz_look_up_transform` - LUT transformation of a string, similar to OpenCV LUT - * - `sz_detect_encoding` - similar to `iconv` or `chardet` + * - TODO: `sz_detect_encoding` - similar to `iconv` or `chardet` * * Convenience functions for character-set mapping: * @@ -149,15 +149,6 @@ SZ_PUBLIC void sz_toupper(sz_cptr_t text, sz_size_t length, sz_ptr_t result); */ SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result); -/** - * @brief Checks if all characters in the range are valid ASCII characters. - * - * @param text String to be analyzed. - * @param length Number of bytes in the string. - * @return Whether all characters are valid ASCII characters. 
- */ -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t text, sz_size_t length); - #pragma endregion // Helper API #pragma region Serial Implementation diff --git a/include/stringzilla/similarity.h b/include/stringzilla/similarity.h index e811fefe..ef34b824 100644 --- a/include/stringzilla/similarity.h +++ b/include/stringzilla/similarity.h @@ -150,6 +150,15 @@ SZ_DYNAMIC sz_ssize_t sz_alignment_score( // sz_error_cost_t const *subs, sz_error_cost_t gap, // sz_memory_allocator_t *alloc); +/** + * @brief Checks if all characters in the range are valid ASCII characters. + * + * @param text String to be analyzed. + * @param length Number of bytes in the string. + * @return Whether all characters are valid ASCII characters. + */ +SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t text, sz_size_t length); + /** @copydoc sz_hamming_distance */ SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); @@ -707,9 +716,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_avx512( // // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } + if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return SZ_SIZE_MAX; } // Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles. @@ -740,9 +747,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_avx512( // // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } + if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return SZ_SIZE_MAX; } // Now let's handle the bottom right triangle. @@ -766,9 +771,8 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_avx512( // // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } + if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return SZ_SIZE_MAX; + // In every following iterations we take use a shorter prefix of each register, // but we don't need to update the `next_diagonal_mask` anymore... except for the early exit. next_diagonal_mask = _kshiftri_mask64(next_diagonal_mask, 1); diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index a39620e6..be4a3e0d 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -3,18 +3,26 @@ * @file types.h * @author Ash Vardanian * - * Consider overriding the following macros to customize the library: + * Includes the following types: * - * - `SZ_DEBUG=0` - whether to enable debug assertions and logging. - * - `SZ_AVOID_LIBC=0` - whether to avoid including the standard C library headers. - * - `SZ_DYNAMIC_DISPATCH=0` - whether to use runtime dispatching of the most advanced SIMD backend. - * - `SZ_USE_MISALIGNED_LOADS=0` - whether to use misaligned loads on platforms that support them. 
- * - `SZ_SWAR_THRESHOLD=24` - threshold for switching to SWAR backend over serial byte-level for-loops. - * - `SZ_USE_HASWELL=?` - whether to use AVX2 instructions on x86_64. - * - `SZ_USE_SKYLAKE=?` - whether to use AVX-512 instructions on x86_64. - * - `SZ_USE_ICE=?` - whether to use AVX-512 VBMI instructions on x86_64. - * - `SZ_USE_NEON=?` - whether to use NEON instructions on ARM. - * - `SZ_USE_SVE=?` - whether to use SVE and SVE2 instructions on ARM. + * - `sz_u8_t`, `sz_u16_t`, `sz_u32_t`, `sz_u64_t` - unsigned integers of 8, 16, 32, and 64 bits. + * - `sz_i8_t`, `sz_i16_t`, `sz_i32_t`, `sz_i64_t` - signed integers of 8, 16, 32, and 64 bits. + * - `sz_size_t`, `sz_ssize_t` - unsigned and signed integers of the same size as a pointer. + * - `sz_ptr_t`, `sz_cptr_t` - pointer and constant pointer to a C-style string. + * - `sz_bool_t` - boolean type, `sz_true_k` and `sz_false_k` constants. + * - `sz_ordering_t` - for comparison results, `sz_less_k`, `sz_equal_k`, `sz_greater_k`. + * - @b `sz_u8_vec_t`, `sz_u16_vec_t`, `sz_u32_vec_t`, `sz_u64_vec_t` - @b SWAR vector types. + * - @b `sz_u128_vec_t`, `sz_u256_vec_t`, `sz_u512_vec_t` - @b SIMD vector types for x86 and Arm. + * - @b `sz_rune_t` - for 32-bit Unicode code points ~ @b runes. + * - `sz_rune_length_t` - to describe the number of bytes in a UTF8-encoded rune. + * - `sz_error_cost_t` - for substitution costs in string alignment and scoring algorithms. + * + * The library also defines the following higher-level structures: + * + * - `sz_string_view_t` - for a C-style `std::string_view`-like structure. + * - `sz_memory_allocator_t` - a wrapper for memory-management functions. + * - `sz_sequence_t` - a wrapper to access strings forming a sequential container. + * - `sz_charset_t` - a bitset for 256 possible byte values. */ #ifndef STRINGZILLA_TYPES_H_ #define STRINGZILLA_TYPES_H_ @@ -864,8 +872,8 @@ SZ_INTERNAL sz_u64_vec_t _sz_u64_each_byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) /** * @brief Clamps signed offsets in a string to a valid range. Used for Pythonic-style slicing. */ -SZ_INTERNAL void sz_ssize_clamp_interval(sz_size_t length, sz_ssize_t start, sz_ssize_t end, - sz_size_t *normalized_offset, sz_size_t *normalized_length) { +SZ_INTERNAL void sz_ssize_clamp_interval( // + sz_size_t length, sz_ssize_t start, sz_ssize_t end, sz_size_t *normalized_offset, sz_size_t *normalized_length) { // TODO: Remove branches. // Normalize negative indices if (start < 0) start += length; @@ -1023,7 +1031,7 @@ SZ_INTERNAL sz_u64_vec_t sz_u64_load(sz_cptr_t ptr) { /** @brief Helper function, using the supplied fixed-capacity buffer to allocate memory. 
*/ SZ_INTERNAL sz_ptr_t _sz_memory_allocate_fixed(sz_size_t length, void *handle) { sz_size_t capacity; - sz_copy((sz_ptr_t)&capacity, (sz_cptr_t)handle, sizeof(sz_size_t)); + *(sz_ptr_t)&capacity = *(sz_cptr_t)handle; sz_size_t consumed_capacity = sizeof(sz_size_t); if (consumed_capacity + length > capacity) return SZ_NULL_CHAR; return (sz_ptr_t)handle + consumed_capacity; @@ -1098,7 +1106,7 @@ SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_fixed; alloc->free = (sz_memory_free_t)_sz_memory_free_fixed; alloc->handle = &buffer; - sz_copy((sz_ptr_t)buffer, (sz_cptr_t)&length, sizeof(sz_size_t)); + *(sz_ptr_t)buffer = *(sz_cptr_t)&length; } #pragma endregion diff --git a/scripts/bench_memory.cpp b/scripts/bench_memory.cpp index d8131102..ee6ae03b 100644 --- a/scripts/bench_memory.cpp +++ b/scripts/bench_memory.cpp @@ -69,16 +69,16 @@ tracked_unary_functions_t copy_functions(sz_cptr_t dataset_start_ptr, sz_ptr_t o tracked_unary_functions_t result = { {"memcpy" + suffix, wrap_sz(memcpy)}, {"sz_copy_serial" + suffix, wrap_sz(sz_copy_serial)}, -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE {"sz_copy_avx512" + suffix, wrap_sz(sz_copy_avx512)}, #endif -#if SZ_USE_X86_AVX2 +#if SZ_USE_HASWELL {"sz_copy_avx2" + suffix, wrap_sz(sz_copy_avx2)}, #endif -#if SZ_USE_ARM_SVE +#if SZ_USE_SVE {"sz_copy_sve" + suffix, wrap_sz(sz_copy_sve)}, #endif -#if SZ_USE_ARM_NEON +#if SZ_USE_NEON {"sz_copy_neon" + suffix, wrap_sz(sz_copy_neon)}, #endif }; @@ -109,16 +109,16 @@ tracked_unary_functions_t fill_functions(sz_cptr_t dataset_start_ptr, sz_ptr_t o return slice.size(); })}, {"sz_fill_serial", wrap_sz(sz_fill_serial)}, -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE {"sz_fill_avx512", wrap_sz(sz_fill_avx512)}, #endif -#if SZ_USE_X86_AVX2 +#if SZ_USE_HASWELL {"sz_fill_avx2", wrap_sz(sz_fill_avx2)}, #endif -#if SZ_USE_ARM_SVE +#if SZ_USE_SVE {"sz_fill_sve", wrap_sz(sz_fill_sve)}, #endif -#if SZ_USE_ARM_NEON +#if SZ_USE_NEON {"sz_fill_neon", wrap_sz(sz_fill_neon)}, #endif }; @@ -149,13 +149,13 @@ tracked_unary_functions_t move_functions(sz_cptr_t dataset_start_ptr, sz_ptr_t o tracked_unary_functions_t result = { {"memmove" + suffix, wrap_sz(memmove)}, {"sz_move_serial" + suffix, wrap_sz(sz_move_serial)}, -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE {"sz_move_avx512" + suffix, wrap_sz(sz_move_avx512)}, #endif -#if SZ_USE_X86_AVX2 +#if SZ_USE_HASWELL {"sz_move_avx2" + suffix, wrap_sz(sz_move_avx2)}, #endif -#if SZ_USE_ARM_NEON +#if SZ_USE_NEON {"sz_move_neon" + suffix, wrap_sz(sz_move_neon)}, #endif }; @@ -192,13 +192,13 @@ tracked_unary_functions_t transform_functions() { return slice.size(); })}, {"sz_look_up_transform_serial", wrap_sz(sz_look_up_transform_serial)}, -#if SZ_USE_X86_AVX512 - {"sz_look_up_transform_avx512", wrap_sz(sz_look_up_transform_avx512)}, +#if SZ_USE_ICE + {"sz_look_up_transform_ice", wrap_sz(sz_look_up_transform_ice)}, #endif -#if SZ_USE_X86_AVX2 +#if SZ_USE_HASWELL {"sz_look_up_transform_avx2", wrap_sz(sz_look_up_transform_avx2)}, #endif -#if SZ_USE_ARM_NEON +#if SZ_USE_NEON {"sz_look_up_transform_neon", wrap_sz(sz_look_up_transform_neon)}, #endif }; diff --git a/scripts/bench_search.cpp b/scripts/bench_search.cpp index ada4ded4..7380a697 100644 --- a/scripts/bench_search.cpp +++ b/scripts/bench_search.cpp @@ -29,13 +29,13 @@ tracked_binary_functions_t find_functions() { return (match == std::string_view::npos ? 
h.size() : match); }}, {"sz_find_serial", wrap_sz(sz_find_serial), true}, -#if SZ_USE_X86_AVX512 - {"sz_find_avx512", wrap_sz(sz_find_avx512), true}, +#if SZ_USE_SKYLAKE + {"sz_find_skylake", wrap_sz(sz_find_skylake), true}, #endif -#if SZ_USE_X86_AVX2 - {"sz_find_avx2", wrap_sz(sz_find_avx2), true}, +#if SZ_USE_HASWELL + {"sz_find_haswell", wrap_sz(sz_find_haswell), true}, #endif -#if SZ_USE_ARM_NEON +#if SZ_USE_NEON {"sz_find_neon", wrap_sz(sz_find_neon), true}, #endif {"strstr/strchr", @@ -90,13 +90,13 @@ tracked_binary_functions_t rfind_functions() { return (match == std::string_view::npos ? 0 : match); }}, {"sz_rfind_serial", wrap_sz(sz_rfind_serial), true}, -#if SZ_USE_X86_AVX512 - {"sz_rfind_avx512", wrap_sz(sz_rfind_avx512), true}, +#if SZ_USE_SKYLAKE + {"sz_rfind_skylake", wrap_sz(sz_rfind_skylake), true}, #endif -#if SZ_USE_X86_AVX2 - {"sz_rfind_avx2", wrap_sz(sz_rfind_avx2), true}, +#if SZ_USE_HASWELL + {"sz_rfind_haswell", wrap_sz(sz_rfind_haswell), true}, #endif -#if SZ_USE_ARM_NEON +#if SZ_USE_NEON {"sz_rfind_neon", wrap_sz(sz_rfind_neon), true}, #endif {"std::search", @@ -140,13 +140,13 @@ tracked_binary_functions_t find_charset_functions() { return (match == std::string_view::npos ? h.size() : match); }}, {"sz_find_charset_serial", wrap_sz(sz_find_charset_serial), true}, -#if SZ_USE_X86_AVX2 - {"sz_find_charset_avx2", wrap_sz(sz_find_charset_avx2), true}, +#if SZ_USE_HASWELL + {"sz_find_charset_haswell", wrap_sz(sz_find_charset_haswell), true}, #endif -#if SZ_USE_X86_AVX512 - {"sz_find_charset_avx512", wrap_sz(sz_find_charset_avx512), true}, +#if SZ_USE_ICE + {"sz_find_charset_ice", wrap_sz(sz_find_charset_ice), true}, #endif -#if SZ_USE_ARM_NEON +#if SZ_USE_NEON {"sz_find_charset_neon", wrap_sz(sz_find_charset_neon), true}, #endif {"strcspn", [](std::string_view h, std::string_view n) { return strcspn(h.data(), n.data()); }}, @@ -171,10 +171,10 @@ tracked_binary_functions_t rfind_charset_functions() { return (match == std::string_view::npos ? 0 : match); }}, {"sz_rfind_charset_serial", wrap_sz(sz_rfind_charset_serial), true}, -#if SZ_USE_X86_AVX512 - {"sz_rfind_charset_avx512", wrap_sz(sz_rfind_charset_avx512), true}, +#if SZ_USE_ICE + {"sz_rfind_charset_ice", wrap_sz(sz_rfind_charset_ice), true}, #endif -#if SZ_USE_ARM_NEON +#if SZ_USE_NEON {"sz_rfind_charset_neon", wrap_sz(sz_rfind_charset_neon), true}, #endif }; @@ -184,8 +184,8 @@ tracked_binary_functions_t rfind_charset_functions() { /** * @brief Evaluation for search string operations: find. */ -void bench_finds(std::string const &haystack, std::vector const &strings, - tracked_binary_functions_t &&variants) { +void bench_finds( // + std::string const &haystack, std::vector const &strings, tracked_binary_functions_t &&variants) { for (std::size_t variant_idx = 0; variant_idx != variants.size(); ++variant_idx) { auto &variant = variants[variant_idx]; @@ -234,8 +234,8 @@ void bench_finds(std::string const &haystack, std::vector const &st /** * @brief Evaluation for reverse order search string operations: find. 
*/ -void bench_rfinds(std::string const &haystack, std::vector const &strings, - tracked_binary_functions_t &&variants) { +void bench_rfinds( // + std::string const &haystack, std::vector const &strings, tracked_binary_functions_t &&variants) { for (std::size_t variant_idx = 0; variant_idx != variants.size(); ++variant_idx) { auto &variant = variants[variant_idx]; @@ -336,15 +336,17 @@ int main(int argc, char const **argv) { bench_search(dataset.text, filter_by_length(dataset.tokens, token_length)); } - // Run bechnmarks on abstract tokens of different length + // Run benchmarks on abstract tokens of different length for (std::size_t token_length : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32}) { std::printf("Benchmarking for missing tokens of length %zu:\n", token_length); - bench_search(dataset.text, std::vector { - std::string(token_length, '\1'), - std::string(token_length, '\2'), - std::string(token_length, '\3'), - std::string(token_length, '\4'), - }); + bench_search( // + dataset.text, // + std::vector { + std::string(token_length, '\1'), + std::string(token_length, '\2'), + std::string(token_length, '\3'), + std::string(token_length, '\4'), + }); } std::printf("All benchmarks passed.\n"); diff --git a/scripts/bench_similarity.cpp b/scripts/bench_similarity.cpp index b2c36a60..140433e2 100644 --- a/scripts/bench_similarity.cpp +++ b/scripts/bench_similarity.cpp @@ -54,7 +54,7 @@ tracked_binary_functions_t distance_functions() { {"naive", wrap_baseline}, {"sz_edit_distance", wrap_sz_distance(sz_edit_distance_serial), true}, {"sz_alignment_score", wrap_sz_scoring(sz_alignment_score_serial), true}, -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE {"sz_edit_distance_avx512", wrap_sz_distance(sz_edit_distance_avx512), true}, {"sz_alignment_score_avx512", wrap_sz_scoring(sz_alignment_score_avx512), true}, #endif diff --git a/scripts/bench_token.cpp b/scripts/bench_token.cpp index f699f459..1120ad52 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -22,13 +22,12 @@ tracked_unary_functions_t checksum_functions() { [](std::size_t sum, char c) { return sum + static_cast(c); }); }}, {"sz_checksum_serial", wrap_sz(sz_checksum_serial), true}, -#if SZ_USE_X86_AVX2 - {"sz_checksum_avx2", wrap_sz(sz_checksum_avx2), true}, +#if SZ_USE_HASWELL + {"sz_checksum_haswell", wrap_sz(sz_checksum_haswell), true}, #endif -#if SZ_USE_X86_AVX512 - {"sz_checksum_avx512", wrap_sz(sz_checksum_avx512), true}, +#if SZ_USE_ICE #endif -#if SZ_USE_ARM_NEON +#if SZ_USE_NEON {"sz_checksum_neon", wrap_sz(sz_checksum_neon), true}, #endif }; @@ -56,11 +55,11 @@ tracked_unary_functions_t sliding_hashing_functions(std::size_t window_width, st }; std::string suffix = std::to_string(window_width) + ":step" + std::to_string(step); tracked_unary_functions_t result = { -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE {"sz_hashes_avx512:" + suffix, wrap_sz(sz_hashes_avx512)}, #endif -#if SZ_USE_X86_AVX2 - {"sz_hashes_avx2:" + suffix, wrap_sz(sz_hashes_avx2)}, +#if SZ_USE_HASWELL + {"sz_hashes_haswell:" + suffix, wrap_sz(sz_hashes_haswell)}, #endif {"sz_hashes_serial:" + suffix, wrap_sz(sz_hashes_serial)}, }; @@ -118,10 +117,10 @@ tracked_binary_functions_t equality_functions() { tracked_binary_functions_t result = { {"std::string_view.==", [](std::string_view a, std::string_view b) { return (a == b); }}, {"sz_equal_serial", wrap_sz(sz_equal_serial), true}, -#if SZ_USE_X86_AVX2 - {"sz_equal_avx2", wrap_sz(sz_equal_avx2), true}, +#if SZ_USE_HASWELL + {"sz_equal_haswell", wrap_sz(sz_equal_haswell), true}, #endif -#if SZ_USE_X86_AVX512 +#if 
SZ_USE_ICE {"sz_equal_avx512", wrap_sz(sz_equal_avx512), true}, #endif {"memcmp", @@ -145,10 +144,10 @@ tracked_binary_functions_t ordering_functions() { return (order == 0 ? sz_equal_k : (order < 0 ? sz_less_k : sz_greater_k)); }}, {"sz_order_serial", wrap_sz(sz_order_serial), true}, -#if SZ_USE_X86_AVX2 - {"sz_order_avx2", wrap_sz(sz_order_avx2), true}, +#if SZ_USE_HASWELL + {"sz_order_haswell", wrap_sz(sz_order_haswell), true}, #endif -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE {"sz_order_avx512", wrap_sz(sz_order_avx512), true}, #endif {"memcmp", diff --git a/scripts/test.cpp b/scripts/test.cpp index eecc97f0..db856a8e 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -11,10 +11,10 @@ // Those parameters must never be explicitly set during releases, // but they come handy during development, if you want to validate // different ISA-specific implementations. -// #define SZ_USE_X86_AVX2 0 -// #define SZ_USE_X86_AVX512 0 -// #define SZ_USE_ARM_NEON 0 -// #define SZ_USE_ARM_SVE 0 +// #define SZ_USE_HASWELL 0 +// #define SZ_USE_ICE 0 +// #define SZ_USE_NEON 0 +// #define SZ_USE_SVE 0 #define SZ_DEBUG 1 // Enforce aggressive logging for this unit. // Put this at the top to make sure it pulls all the right dependencies @@ -1576,10 +1576,10 @@ int main(int argc, char const **argv) { // Let's greet the user nicely sz_unused(argc && argv); std::printf("Hi, dear tester! You look nice today!\n"); - std::printf("- Uses AVX2: %s \n", SZ_USE_X86_AVX2 ? "yes" : "no"); - std::printf("- Uses AVX512: %s \n", SZ_USE_X86_AVX512 ? "yes" : "no"); - std::printf("- Uses NEON: %s \n", SZ_USE_ARM_NEON ? "yes" : "no"); - std::printf("- Uses SVE: %s \n", SZ_USE_ARM_SVE ? "yes" : "no"); + std::printf("- Uses AVX2: %s \n", SZ_USE_HASWELL ? "yes" : "no"); + std::printf("- Uses AVX512: %s \n", SZ_USE_ICE ? "yes" : "no"); + std::printf("- Uses NEON: %s \n", SZ_USE_NEON ? "yes" : "no"); + std::printf("- Uses SVE: %s \n", SZ_USE_SVE ? "yes" : "no"); // Basic utilities test_arithmetical_utilities(); From 41e59179629f20657c510d8cbe48b8ceaf92be39 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 19:24:25 +0000 Subject: [PATCH 042/751] Fix: Partially filter `stringzilla.h` file --- include/stringzilla/stringzilla.h | 909 ++++-------------------------- 1 file changed, 125 insertions(+), 784 deletions(-) diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index de7fbcac..c0b1b369 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -2,23 +2,37 @@ * @brief StringZilla is a collection of advanced string algorithms, designed to be used in Big Data applications. * It is generally faster than LibC, and has a broader & cleaner interface, and targets modern x86 CPUs * with AVX-512 and Arm NEON and older CPUs with SWAR and auto-vectorization. + * @file stringzilla.h + * @author Ash Vardanian + * + * @see StringZilla docs: https://github.com/ashvardanian/StringZilla/blob/main/README.md + * @see LibC string docs: https://pubs.opengroup.org/onlinepubs/009695399/basedefs/string.h.html + * + * @section Introduction + * + * + * @section Compilation Settings * * Consider overriding the following macros to customize the library: * * - `SZ_DEBUG=0` - whether to enable debug assertions and logging. + * - `SZ_AVOID_LIBC=0` - whether to avoid including the standard C library headers. * - `SZ_DYNAMIC_DISPATCH=0` - whether to use runtime dispatching of the most advanced SIMD backend. 
* - `SZ_USE_MISALIGNED_LOADS=0` - whether to use misaligned loads on platforms that support them. + * + * Performance tuning: + * * - `SZ_SWAR_THRESHOLD=24` - threshold for switching to SWAR backend over serial byte-level for-loops. - * - `SZ_USE_X86_AVX512=?` - whether to use AVX-512 instructions on x86_64. - * - `SZ_USE_X86_AVX2=?` - whether to use AVX2 instructions on x86_64. - * - `SZ_USE_ARM_NEON=?` - whether to use NEON instructions on ARM. - * - `SZ_USE_ARM_SVE=?` - whether to use SVE instructions on ARM. + * - `SZ_CACHE_LINE_WIDTH=64` - cache-line width that affects the execution of some algorithms. + * - `SZ_CACHE_SIZE=1048576` - the combined size of L1d and L2 caches in bytes, affecting temporal loads. * - * @see StringZilla: https://github.com/ashvardanian/StringZilla/blob/main/README.md - * @see LibC String: https://pubs.opengroup.org/onlinepubs/009695399/basedefs/string.h.html + * Different generations of CPUs and SIMD capabilities can be enabled or disabled with the following macros: * - * @file stringzilla.h - * @author Ash Vardanian + * - `SZ_USE_HASWELL=?` - whether to use AVX2 instructions on x86_64. + * - `SZ_USE_SKYLAKE=?` - whether to use AVX-512 instructions on x86_64. + * - `SZ_USE_ICE=?` - whether to use AVX-512 VBMI instructions on x86_64. + * - `SZ_USE_NEON=?` - whether to use NEON instructions on ARM. + * - `SZ_USE_SVE=?` - whether to use SVE and SVE2 instructions on ARM. */ #ifndef STRINGZILLA_H_ #define STRINGZILLA_H_ @@ -27,229 +41,10 @@ #define STRINGZILLA_VERSION_MINOR 11 #define STRINGZILLA_VERSION_PATCH 0 -/** - * @brief When set to 1, the library will include the following LibC headers: and . - * In debug builds (SZ_DEBUG=1), the library will also include and . - * - * You may want to disable this compiling for use in the kernel, or in embedded systems. - * You may also avoid them, if you are very sensitive to compilation time and avoid pre-compiled headers. - * https://artificial-mind.net/projects/compile-health/ - */ -#ifndef SZ_AVOID_LIBC -#define SZ_AVOID_LIBC (0) // true or false -#endif - -/** - * @brief A misaligned load can be - trying to fetch eight consecutive bytes from an address - * that is not divisible by eight. On x86 enabled by default. On ARM it's not. - * - * Most platforms support it, but there is no industry standard way to check for those. - * This value will mostly affect the performance of the serial (SWAR) backend. - */ -#ifndef SZ_USE_MISALIGNED_LOADS -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) -#define SZ_USE_MISALIGNED_LOADS (1) // true or false -#else -#define SZ_USE_MISALIGNED_LOADS (0) // true or false -#endif -#endif - -/** - * @brief Removes compile-time dispatching, and replaces it with runtime dispatching. - * So the `sz_find` function will invoke the most advanced backend supported by the CPU, - * that runs the program, rather than the most advanced backend supported by the CPU - * used to compile the library or the downstream application. - */ -#ifndef SZ_DYNAMIC_DISPATCH -#define SZ_DYNAMIC_DISPATCH (0) // true or false -#endif - -/** - * @brief Analogous to `size_t` and `std::size_t`, unsigned integer, identical to pointer size. - * 64-bit on most platforms where pointers are 64-bit. - * 32-bit on platforms where pointers are 32-bit. - */ -#if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64) -#define SZ_DETECT_64_BIT (1) -#define SZ_SIZE_MAX (0xFFFFFFFFFFFFFFFFull) // Largest unsigned integer that fits into 64 bits. 
-#define SZ_SSIZE_MAX (0x7FFFFFFFFFFFFFFFull) // Largest signed integer that fits into 64 bits. -#else -#define SZ_DETECT_64_BIT (0) -#define SZ_SIZE_MAX (0xFFFFFFFFu) // Largest unsigned integer that fits into 32 bits. -#define SZ_SSIZE_MAX (0x7FFFFFFFu) // Largest signed integer that fits into 32 bits. -#endif - -/** - * @brief On Big-Endian machines StringZilla will work in compatibility mode. - * This disables SWAR hacks to minimize code duplication, assuming practically - * all modern popular platforms are Little-Endian. - * - * This variable is hard to infer from macros reliably. It's best to set it manually. - * For that CMake provides the `TestBigEndian` and `CMAKE__BYTE_ORDER` (from 3.20 onwards). - * In Python one can check `sys.byteorder == 'big'` in the `setup.py` script and pass the appropriate macro. - * https://stackoverflow.com/a/27054190 - */ -#ifndef SZ_DETECT_BIG_ENDIAN -#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \ - defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || defined(__MIBSEB) || defined(__MIBSEB__) -#define SZ_DETECT_BIG_ENDIAN (1) //< It's a big-endian target architecture -#else -#define SZ_DETECT_BIG_ENDIAN (0) //< It's a little-endian target architecture -#endif -#endif - -/* - * Debugging and testing. - */ -#ifndef SZ_DEBUG -#if defined(DEBUG) || defined(_DEBUG) // This means "Not using DEBUG information". -#define SZ_DEBUG (1) -#else -#define SZ_DEBUG (0) -#endif -#endif - -/** - * @brief Threshold for switching to SWAR (8-bytes at a time) backend over serial byte-level for-loops. - * On very short strings, under 16 bytes long, at most a single word will be processed with SWAR. - * Assuming potentially misaligned loads, SWAR makes sense only after ~24 bytes. - */ -#ifndef SZ_SWAR_THRESHOLD -#if SZ_DEBUG -#define SZ_SWAR_THRESHOLD (8u) // 8 bytes in debug builds -#else -#define SZ_SWAR_THRESHOLD (24u) // 24 bytes in release builds -#endif -#endif - -/* Annotation for the public API symbols: - * - * - `SZ_PUBLIC` is used for functions that are part of the public API. - * - `SZ_INTERNAL` is used for internal helper functions with unstable APIs. - * - `SZ_DYNAMIC` is used for functions that are part of the public API, but are dispatched at runtime. - */ -#ifndef SZ_DYNAMIC -#if SZ_DYNAMIC_DISPATCH -#if defined(_WIN32) || defined(__CYGWIN__) -#define SZ_DYNAMIC __declspec(dllexport) -#define SZ_EXTERNAL __declspec(dllimport) -#define SZ_PUBLIC inline static -#define SZ_INTERNAL inline static -#else -#define SZ_DYNAMIC __attribute__((visibility("default"))) -#define SZ_EXTERNAL extern -#define SZ_PUBLIC __attribute__((unused)) inline static -#define SZ_INTERNAL __attribute__((always_inline)) inline static -#endif // _WIN32 || __CYGWIN__ -#else -#define SZ_DYNAMIC inline static -#define SZ_EXTERNAL extern -#define SZ_PUBLIC inline static -#define SZ_INTERNAL inline static -#endif // SZ_DYNAMIC_DISPATCH -#endif // SZ_DYNAMIC - -/** - * @brief Alignment macro for 64-byte alignment. - */ -#if defined(_MSC_VER) -#define SZ_ALIGN64 __declspec(align(64)) -#elif defined(__GNUC__) || defined(__clang__) -#define SZ_ALIGN64 __attribute__((aligned(64))) -#else -#define SZ_ALIGN64 -#endif - #ifdef __cplusplus extern "C" { #endif -/* - * Let's infer the integer types or pull them from LibC, - * if that is allowed by the user. 
- */ -#if !SZ_AVOID_LIBC -#include // `size_t` -#include // `uint8_t` -typedef int8_t sz_i8_t; // Always 8 bits -typedef uint8_t sz_u8_t; // Always 8 bits -typedef uint16_t sz_u16_t; // Always 16 bits -typedef int32_t sz_i32_t; // Always 32 bits -typedef uint32_t sz_u32_t; // Always 32 bits -typedef uint64_t sz_u64_t; // Always 64 bits -typedef int64_t sz_i64_t; // Always 64 bits -typedef size_t sz_size_t; // Pointer-sized unsigned integer, 32 or 64 bits -typedef ptrdiff_t sz_ssize_t; // Signed version of `sz_size_t`, 32 or 64 bits - -#else // if SZ_AVOID_LIBC: - -// ! The C standard doesn't specify the signedness of char. -// ! On x86 char is signed by default while on Arm it is unsigned by default. -// ! That's why we don't define `sz_char_t` and generally use explicit `sz_i8_t` and `sz_u8_t`. -typedef signed char sz_i8_t; // Always 8 bits -typedef unsigned char sz_u8_t; // Always 8 bits -typedef unsigned short sz_u16_t; // Always 16 bits -typedef int sz_i32_t; // Always 32 bits -typedef unsigned int sz_u32_t; // Always 32 bits -typedef long long sz_i64_t; // Always 64 bits -typedef unsigned long long sz_u64_t; // Always 64 bits - -// Now we need to redefine the `size_t`. -// Microsoft Visual C++ (MSVC) typically follows LLP64 data model on 64-bit platforms, -// where integers, pointers, and long types have different sizes: -// -// > `int` is 32 bits -// > `long` is 32 bits -// > `long long` is 64 bits -// > pointer (thus, `size_t`) is 64 bits -// -// In contrast, GCC and Clang on 64-bit Unix-like systems typically follow the LP64 model, where: -// -// > `int` is 32 bits -// > `long` and pointer (thus, `size_t`) are 64 bits -// > `long long` is also 64 bits -// -// Source: https://learn.microsoft.com/en-us/windows/win32/winprog64/abstract-data-models -#if SZ_DETECT_64_BIT -typedef unsigned long long sz_size_t; // 64-bit. -typedef long long sz_ssize_t; // 64-bit. -#else -typedef unsigned sz_size_t; // 32-bit. -typedef unsigned sz_ssize_t; // 32-bit. -#endif // SZ_DETECT_64_BIT - -#endif // SZ_AVOID_LIBC - -/** - * @brief Compile-time assert macro similar to `static_assert` in C++. - */ -#define sz_static_assert(condition, name) \ - typedef struct { \ - int static_assert_##name : (condition) ? 1 : -1; \ - } sz_static_assert_##name##_t - -sz_static_assert(sizeof(sz_size_t) == sizeof(void *), sz_size_t_must_be_pointer_size); -sz_static_assert(sizeof(sz_ssize_t) == sizeof(void *), sz_ssize_t_must_be_pointer_size); - -#pragma region Public API - -typedef char *sz_ptr_t; // A type alias for `char *` -typedef char const *sz_cptr_t; // A type alias for `char const *` -typedef sz_i8_t sz_error_cost_t; // Character mismatch cost for fuzzy matching functions - -typedef sz_u64_t sz_sorted_idx_t; // Index of a sorted string in a list of strings - -typedef enum { sz_false_k = 0, sz_true_k = 1 } sz_bool_t; // Only one relevant bit -typedef enum { sz_less_k = -1, sz_equal_k = 0, sz_greater_k = 1 } sz_ordering_t; // Only three possible states: <=> - -/** - * @brief Tiny string-view structure. It's POD type, unlike the `std::string_view`. - */ -typedef struct sz_string_view_t { - sz_cptr_t start; - sz_size_t length; -} sz_string_view_t; - /** * @brief Enumeration of SIMD capabilities of the target architecture. * Used to introspect the supported functionality of the dynamic library. @@ -277,176 +72,6 @@ typedef enum sz_capability_t { */ SZ_DYNAMIC sz_capability_t sz_capabilities(void); -/** - * @brief Bit-set structure for 256 possible byte values. Useful for filtering and search. 
- * @see sz_charset_init, sz_charset_add, sz_charset_contains, sz_charset_invert - */ -typedef union sz_charset_t { - sz_u64_t _u64s[4]; - sz_u32_t _u32s[8]; - sz_u16_t _u16s[16]; - sz_u8_t _u8s[32]; -} sz_charset_t; - -/** @brief Initializes a bit-set to an empty collection, meaning - all characters are banned. */ -SZ_PUBLIC void sz_charset_init(sz_charset_t *s) { s->_u64s[0] = s->_u64s[1] = s->_u64s[2] = s->_u64s[3] = 0; } - -/** @brief Adds a character to the set and accepts @b unsigned integers. */ -SZ_PUBLIC void sz_charset_add_u8(sz_charset_t *s, sz_u8_t c) { s->_u64s[c >> 6] |= (1ull << (c & 63u)); } - -/** @brief Adds a character to the set. Consider @b sz_charset_add_u8. */ -SZ_PUBLIC void sz_charset_add(sz_charset_t *s, char c) { sz_charset_add_u8(s, *(sz_u8_t *)(&c)); } // bitcast - -/** @brief Checks if the set contains a given character and accepts @b unsigned integers. */ -SZ_PUBLIC sz_bool_t sz_charset_contains_u8(sz_charset_t const *s, sz_u8_t c) { - // Checking the bit can be done in different ways: - // - (s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0 - // - (s->_u32s[c >> 5] & (1u << (c & 31u))) != 0 - // - (s->_u16s[c >> 4] & (1u << (c & 15u))) != 0 - // - (s->_u8s[c >> 3] & (1u << (c & 7u))) != 0 - return (sz_bool_t)((s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0); -} - -/** @brief Checks if the set contains a given character. Consider @b sz_charset_contains_u8. */ -SZ_PUBLIC sz_bool_t sz_charset_contains(sz_charset_t const *s, char c) { - return sz_charset_contains_u8(s, *(sz_u8_t *)(&c)); // bitcast -} - -/** @brief Inverts the contents of the set, so allowed character get disallowed, and vice versa. */ -SZ_PUBLIC void sz_charset_invert(sz_charset_t *s) { - s->_u64s[0] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[1] ^= 0xFFFFFFFFFFFFFFFFull, // - s->_u64s[2] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[3] ^= 0xFFFFFFFFFFFFFFFFull; -} - -typedef void *(*sz_memory_allocate_t)(sz_size_t, void *); -typedef void (*sz_memory_free_t)(void *, sz_size_t, void *); -typedef sz_u64_t (*sz_random_generator_t)(void *); - -/** - * @brief Some complex pattern matching algorithms may require memory allocations. - * This structure is used to pass the memory allocator to those functions. - * @see sz_memory_allocator_init_fixed - */ -typedef struct sz_memory_allocator_t { - sz_memory_allocate_t allocate; - sz_memory_free_t free; - void *handle; -} sz_memory_allocator_t; - -/** - * @brief Initializes a memory allocator to use the system default `malloc` and `free`. - * ! The function is not available if the library was compiled with `SZ_AVOID_LIBC`. - * - * @param alloc Memory allocator to initialize. - */ -SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc); - -/** - * @brief Initializes a memory allocator to use a static-capacity buffer. - * No dynamic allocations will be performed. - * - * @param alloc Memory allocator to initialize. - * @param buffer Buffer to use for allocations. - * @param length Length of the buffer. @b Must be greater than 8 bytes. Different values would be optimal for - * different algorithms and input lengths, but 4096 bytes (one RAM page) is a good default. - */ -SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length); - -/** - * @brief The number of bytes a stack-allocated string can hold, including the SZ_NULL termination character. - * ! This can't be changed from outside. Don't use the `#error` as it may already be included and set. 
- */ -#ifdef SZ_STRING_INTERNAL_SPACE -#undef SZ_STRING_INTERNAL_SPACE -#endif -#define SZ_STRING_INTERNAL_SPACE (sizeof(sz_size_t) * 3 - 1) // 3 pointers minus one byte for an 8-bit length - -/** - * @brief Tiny memory-owning string structure with a Small String Optimization (SSO). - * Differs in layout from Folly, Clang, GCC, and probably most other implementations. - * It's designed to avoid any branches on read-only operations, and can store up - * to 22 characters on stack on 64-bit machines, followed by the SZ_NULL-termination character. - * - * @section Changing Length - * - * One nice thing about this design, is that you can, in many cases, change the length of the string - * without any branches, invoking a `+=` or `-=` on the 64-bit `length` field. If the string is on heap, - * the solution is obvious. If it's on stack, inplace decrement wouldn't affect the top bytes of the string, - * only changing the last byte containing the length. - */ -typedef union sz_string_t { - -#if !SZ_DETECT_BIG_ENDIAN - - struct external { - sz_ptr_t start; - sz_size_t length; - sz_size_t space; - sz_size_t padding; - } external; - - struct internal { - sz_ptr_t start; - sz_u8_t length; - char chars[SZ_STRING_INTERNAL_SPACE]; - } internal; - -#else - - struct external { - sz_ptr_t start; - sz_size_t space; - sz_size_t padding; - sz_size_t length; - } external; - - struct internal { - sz_ptr_t start; - char chars[SZ_STRING_INTERNAL_SPACE]; - sz_u8_t length; - } internal; - -#endif - - sz_size_t words[4]; - -} sz_string_t; - -typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t); -typedef sz_u64_t (*sz_checksum_t)(sz_cptr_t, sz_size_t); -typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t); -typedef sz_ordering_t (*sz_order_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -typedef void (*sz_to_converter_t)(sz_cptr_t, sz_size_t, sz_ptr_t); - -/** - * @brief Computes the 64-bit check-sum of bytes in a string. - * Similar to `std::ranges::accumulate`. - * - * @param text String to aggregate. - * @param length Number of bytes in the text. - * @return 64-bit unsigned value. - */ -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length); - -/** @copydoc sz_checksum */ -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length); - -/** - * @brief Computes the 64-bit unsigned hash of a string. Fairly fast for short strings, - * simple implementation, and supports rolling computation, reused in other APIs. - * Similar to `std::hash` in C++. - * - * @param text String to hash. - * @param length Number of bytes in the text. - * @return 64-bit hash value. - * - * @see sz_hashes, sz_hashes_fingerprint, sz_hashes_intersection - */ -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length); - -/** @copydoc sz_hash */ -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length); - /** * @brief Checks if two string are equal. * Similar to `memcmp(a, b, length) == 0` in LibC and `a == b` in STL. @@ -480,139 +105,6 @@ SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, /** @copydoc sz_order */ SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** - * @brief Look Up Table @b (LUT) transformation of a string. Equivalent to `for (char & c : text) c = lut[c]`. - * - * Can be used to implement some form of string normalization, partially masking punctuation marks, - * or converting between different character sets, like uppercase or lowercase. 
Surprisingly, also has - * broad implications in image processing, where image channel transformations are often done using LUTs. - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param lut Look Up Table to apply. Must be exactly @b 256 bytes long. - * @param result Output string, can point to the same address as ::text. - */ -SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); - -typedef void (*sz_look_up_transform_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t); - -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = tolower(c)`. - * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_tolower(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = toupper(c)`. - * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_toupper(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = toascii(c)`. - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Checks if all characters in the range are valid ASCII characters. - * - * @param text String to be analyzed. - * @param length Number of bytes in the string. - * @return Whether all characters are valid ASCII characters. - */ -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t text, sz_size_t length); - -/** - * @brief Generates a random string for a given alphabet, avoiding integer division and modulo operations. - * Similar to `text[i] = alphabet[rand() % cardinality]`. - * - * The modulo operation is expensive, and should be avoided in performance-critical code. - * We avoid it using small lookup tables and replacing it with a multiplication and shifts, similar to `libdivide`. - * Alternative algorithms would include: - * - Montgomery form: https://en.algorithmica.org/hpc/number-theory/montgomery/ - * - Barret reduction: https://www.nayuki.io/page/barrett-reduction-algorithm - * - Lemire's trick: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ - * - * @param alphabet Set of characters to sample from. 
- * @param cardinality Number of characters to sample from. - * @param text Output string, can point to the same address as ::text. - * @param generate Callback producing random numbers given the generator state. - * @param generator Generator state, can be a pointer to a seed, or a pointer to a random number generator. - */ -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); - -/** @copydoc sz_generate */ -SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); - -/** - * @brief Similar to `memcpy`, copies contents of one string into another. - * The behavior is undefined if the strings overlap. - * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. - */ -SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** - * @brief Similar to `memmove`, copies (moves) contents of one string into another. - * Unlike `sz_copy`, allows overlapping strings as arguments. - * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. - */ -SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); - -typedef void (*sz_move_t)(sz_ptr_t, sz_cptr_t, sz_size_t); - -/** - * @brief Similar to `memset`, fills a string with a given value. - * - * @param target String to fill. - * @param length Number of bytes to fill. - * @param value Value to fill with. - */ -SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value); - -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value); - -typedef void (*sz_fill_t)(sz_ptr_t, sz_size_t, sz_u8_t); - /** * @brief Initializes a string class instance to an empty value. */ @@ -1154,62 +646,62 @@ SZ_PUBLIC void sz_sort_intro(sz_sequence_t *sequence, sz_sequence_comparator_t l * Hardware feature detection. * All of those can be controlled by the user. */ -#ifndef SZ_USE_X86_AVX512 +#ifndef SZ_USE_ICE #ifdef __AVX512BW__ -#define SZ_USE_X86_AVX512 1 +#define SZ_USE_ICE 1 #else -#define SZ_USE_X86_AVX512 0 +#define SZ_USE_ICE 0 #endif #endif -#ifndef SZ_USE_X86_AVX2 +#ifndef SZ_USE_HASWELL #ifdef __AVX2__ -#define SZ_USE_X86_AVX2 1 +#define SZ_USE_HASWELL 1 #else -#define SZ_USE_X86_AVX2 0 +#define SZ_USE_HASWELL 0 #endif #endif -#ifndef SZ_USE_ARM_NEON +#ifndef SZ_USE_NEON #ifdef __ARM_NEON -#define SZ_USE_ARM_NEON 1 +#define SZ_USE_NEON 1 #else -#define SZ_USE_ARM_NEON 0 +#define SZ_USE_NEON 0 #endif #endif -#ifndef SZ_USE_ARM_SVE +#ifndef SZ_USE_SVE #ifdef __ARM_FEATURE_SVE -#define SZ_USE_ARM_SVE 1 +#define SZ_USE_SVE 1 #else -#define SZ_USE_ARM_SVE 0 +#define SZ_USE_SVE 0 #endif #endif /* * Include hardware-specific headers. */ -#if SZ_USE_X86_AVX512 || SZ_USE_X86_AVX2 +#if SZ_USE_ICE || SZ_USE_HASWELL #include #endif // SZ_USE_X86... 
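/* A minimal usage sketch (illustrative only, with hypothetical values): with the per-ISA
 * toggles renamed above (SZ_USE_HASWELL, SZ_USE_ICE, SZ_USE_NEON, SZ_USE_SVE), a
 * translation unit could pin the SIMD backends explicitly before including the header,
 * instead of relying on compiler-provided feature macros like `__AVX2__`:
 *
 *     #define SZ_USE_HASWELL 0   // opt out of AVX2 code paths
 *     #define SZ_USE_ICE 1       // keep AVX-512 (Ice Lake) code paths
 *     #define SZ_USE_NEON 0
 *     #define SZ_USE_SVE 0
 *     #include <stringzilla/stringzilla.h>
 *
 * The macro names follow the renaming in this hunk; the chosen values are hypothetical
 * and only show the intended override pattern.
 */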
-#if SZ_USE_ARM_NEON +#if SZ_USE_NEON #if !defined(_MSC_VER) #include #endif #include -#endif // SZ_USE_ARM_NEON -#if SZ_USE_ARM_SVE +#endif // SZ_USE_NEON +#if SZ_USE_SVE #if !defined(_MSC_VER) #include #endif -#endif // SZ_USE_ARM_SVE +#endif // SZ_USE_SVE #pragma region Hardware Specific API -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE /** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length); +SZ_PUBLIC sz_bool_t sz_equal_skylake(sz_cptr_t a, sz_cptr_t b, sz_size_t length); /** @copydoc sz_order */ SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); /** @copydoc sz_copy */ @@ -1219,19 +711,19 @@ SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt /** @copydoc sz_fill */ SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value); /** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); +SZ_PUBLIC void sz_look_up_transform_ice(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); /** @copydoc sz_find_byte */ SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); /** @copydoc sz_rfind_byte */ SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); /** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); +SZ_PUBLIC sz_cptr_t sz_find_skylake(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); /** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); +SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); /** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); +SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); /** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); +SZ_PUBLIC sz_cptr_t sz_rfind_charset_ice(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); /** @copydoc sz_edit_distance */ SZ_PUBLIC sz_size_t sz_edit_distance_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // sz_size_t bound, sz_memory_allocator_t *alloc); @@ -1244,7 +736,7 @@ SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t text, sz_size_t length, sz_size_t wind sz_hash_callback_t callback, void *callback_handle); #endif -#if SZ_USE_X86_AVX2 +#if SZ_USE_HASWELL /** @copydoc sz_equal */ SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length); /** @copydoc sz_order */ @@ -1270,7 +762,7 @@ SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t text, sz_size_t length, sz_size_t window sz_hash_callback_t callback, void *callback_handle); #endif -#if SZ_USE_ARM_NEON +#if SZ_USE_NEON /** @copydoc sz_equal */ SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length); /** @copydoc sz_order */ @@ -1297,7 +789,7 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t text, sz_size_t length, sz_ch SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); #endif -#if SZ_USE_ARM_SVE +#if SZ_USE_SVE /** @copydoc sz_equal */ SZ_PUBLIC sz_bool_t 
sz_equal_sve(sz_cptr_t a, sz_cptr_t b, sz_size_t length); /** @copydoc sz_order */ @@ -1554,7 +1046,7 @@ SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) { x |= x >> 4; x |= x >> 8; x |= x >> 16; -#if SZ_DETECT_64_BIT +#if _SZ_IS_64_BIT x |= x >> 32; #endif x++; @@ -1740,79 +1232,6 @@ SZ_INTERNAL void _sz_hashes_fingerprint_scalar_callback(sz_cptr_t start, sz_size *scalar_ptr ^= hash; } -/** - * @brief Chooses the offsets of the most interesting characters in a search needle. - * - * Search throughput can significantly deteriorate if we are matching the wrong characters. - * Say the needle is "aXaYa", and we are comparing the first, second, and last character. - * If we use SIMD and compare many offsets at a time, comparing against "a" in every register is a waste. - * - * Similarly, dealing with UTF8 inputs, we know that the lower bits of each character code carry more information. - * Cyrillic alphabet, for example, falls into [0x0410, 0x042F] code range for uppercase [А, Я], and - * into [0x0430, 0x044F] for lowercase [а, я]. Scanning through a text written in Russian, half of the - * bytes will carry absolutely no value and will be equal to 0x04. - */ -SZ_INTERNAL void _sz_locate_needle_anomalies(sz_cptr_t start, sz_size_t length, // - sz_size_t *first, sz_size_t *second, sz_size_t *third) { - *first = 0; - *second = length / 2; - *third = length - 1; - - // - int has_duplicates = // - start[*first] == start[*second] || // - start[*first] == start[*third] || // - start[*second] == start[*third]; - - // Loop through letters to find non-colliding variants. - if (length > 3 && has_duplicates) { - // Pivot the middle point right, until we find a character different from the first one. - for (; start[*second] == start[*first] && *second + 1 < *third; ++(*second)) {} - // Pivot the third (last) point left, until we find a different character. - for (; (start[*third] == start[*second] || start[*third] == start[*first]) && *third > (*second + 1); - --(*third)) {} - } - - // TODO: Investigate alternative strategies for long needles. - // On very long needles we have the luxury to choose! - // Often dealing with UTF8, we will likely benefit from shifting the first and second characters - // further to the right, to achieve not only uniqueness within the needle, but also avoid common - // rune prefixes of 2-, 3-, and 4-byte codes. - if (length > 8) { - // Pivot the first and second points right, until we find a character, that: - // > is different from others. - // > doesn't start with 0b'110x'xxxx - only 5 bits of relevant info. - // > doesn't start with 0b'1110'xxxx - only 4 bits of relevant info. - // > doesn't start with 0b'1111'0xxx - only 3 bits of relevant info. - // - // So we are practically searching for byte values that start with 0b0xxx'xxxx or 0b'10xx'xxxx. - // Meaning they fall in the range [0, 127] and [128, 191], in other words any unsigned int up to 191. - sz_u8_t const *start_u8 = (sz_u8_t const *)start; - sz_size_t vibrant_first = *first, vibrant_second = *second, vibrant_third = *third; - - // Let's begin with the seccond character, as the termination criteria there is more obvious - // and we may end up with more variants to check for the first candidate. - for (; (start_u8[vibrant_second] > 191 || start_u8[vibrant_second] == start_u8[vibrant_third]) && - (vibrant_second + 1 < vibrant_third); - ++vibrant_second) {} - - // Now check if we've indeed found a good candidate or should revert the `vibrant_second` to `second`. 
- if (start_u8[vibrant_second] < 191) { *second = vibrant_second; } - else { vibrant_second = *second; } - - // Now check the first character. - for (; (start_u8[vibrant_first] > 191 || start_u8[vibrant_first] == start_u8[vibrant_second] || - start_u8[vibrant_first] == start_u8[vibrant_third]) && - (vibrant_first + 1 < vibrant_second); - ++vibrant_first) {} - - // Now check if we've indeed found a good candidate or should revert the `vibrant_first` to `first`. - // We don't need to shift the third one when dealing with texts as the last byte of the text is - // also the last byte of a rune and contains the most information. - if (start_u8[vibrant_first] < 191) { *first = vibrant_first; } - } -} - #pragma GCC visibility pop #pragma endregion @@ -1853,26 +1272,6 @@ SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void sz_copy((sz_ptr_t)buffer, (sz_cptr_t)&length, sizeof(sz_size_t)); } -/** - * @brief Byte-level equality comparison between two strings. - * If unaligned loads are allowed, uses a switch-table to avoid loops on short strings. - */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_cptr_t const a_end = a + length; -#if SZ_USE_MISALIGNED_LOADS - if (length >= SZ_SWAR_THRESHOLD) { - sz_u64_vec_t a_vec, b_vec; - for (; a + 8 <= a_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) return sz_false_k; - } - } -#endif - while (a != a_end && *a == *b) a++, b++; - return (sz_bool_t)(a_end == a); -} - SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { for (sz_cptr_t const end = text + length; text != end; ++text) if (sz_charset_contains(set, *text)) return text; @@ -1904,7 +1303,7 @@ SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr sz_bool_t a_shorter = (sz_bool_t)(a_length < b_length); sz_size_t min_length = a_shorter ? a_length : b_length; sz_cptr_t min_end = a + min_length; -#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN +#if SZ_USE_MISALIGNED_LOADS && !_SZ_IS_BIG_ENDIAN for (sz_u64_vec_t a_vec, b_vec; a + 8 <= min_end; a += 8, b += 8) { a_vec = sz_u64_load(a); b_vec = sz_u64_load(b); @@ -1943,7 +1342,7 @@ SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr if (!h_length) return SZ_NULL_CHAR; sz_cptr_t const h_end = h + h_length; -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. +#if !_SZ_IS_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevity. #if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) if (*h == *n) return h; @@ -1980,7 +1379,7 @@ sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { // Reposition the `h` pointer to the end, as we will be walking backwards. h = h + h_length - 1; -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. +#if !_SZ_IS_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevity. #if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((sz_size_t)(h + 1) & 7ull) && h >= h_start; --h) if (*h == *n) return h; @@ -2364,7 +1763,7 @@ SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, // This almost never fires, but it's better to be safe than sorry. 
if (h_length < n_length || !n_length) return SZ_NULL_CHAR; -#if SZ_DETECT_BIG_ENDIAN +#if _SZ_IS_BIG_ENDIAN sz_find_t backends[] = { (sz_find_t)sz_find_byte_serial, (sz_find_t)_sz_find_horspool_upto_256bytes_serial, @@ -2823,7 +2222,7 @@ SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // // Walk through both strings using SWAR and counting the number of differing characters. sz_size_t distance = max_length - min_length; -#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN +#if SZ_USE_MISALIGNED_LOADS && !_SZ_IS_BIG_ENDIAN if (min_length >= SZ_SWAR_THRESHOLD) { sz_u64_vec_t a_vec, b_vec, match_vec; for (; a + 8 <= a_end && distance < bound; a += 8, b += 8) { @@ -3278,7 +2677,7 @@ SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_s // If the string is small, use branch-less approach to mask-out the top 7 bytes of the length. *length = string->external.length & (0x00000000000000FFull | is_big_mask); // In case the string is small, the `is_small - 1ull` will become 0xFFFFFFFFFFFFFFFFull. - *space = sz_u64_blend(SZ_STRING_INTERNAL_SPACE, string->external.space, is_big_mask); + *space = sz_u64_blend(_SZ_STRING_INTERNAL_SPACE, string->external.space, is_big_mask); *is_external = (sz_bool_t)!is_small; } @@ -3336,7 +2735,7 @@ SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, string->words[2] = 0; string->words[3] = 0; // If we are lucky, no memory allocations will be needed. - if (space_needed <= SZ_STRING_INTERNAL_SPACE) { + if (space_needed <= _SZ_STRING_INTERNAL_SPACE) { string->internal.start = &string->internal.chars[0]; string->internal.length = (sz_u8_t)length; } @@ -3357,7 +2756,7 @@ SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); sz_size_t new_space = new_capacity + 1; - if (new_space <= SZ_STRING_INTERNAL_SPACE) return string->external.start; + if (new_space <= _SZ_STRING_INTERNAL_SPACE) return string->external.start; sz_ptr_t string_start; sz_size_t string_length; @@ -3488,64 +2887,6 @@ SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *alloca sz_string_init(string); } -// When overriding libc, disable optimisations for this function beacuse MSVC will optimize the loops into a memset. -// Which then causes a stack overflow due to infinite recursion (memset -> sz_fill_serial -> memset). -#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC -#pragma optimize("", off) -#endif -SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - sz_ptr_t end = target + length; - // Dealing with short strings, a single sequential pass would be faster. - // If the size is larger than 2 words, then at least 1 of them will be aligned. - // But just one aligned word may not be worth SWAR. - if (length < SZ_SWAR_THRESHOLD) - while (target != end) *(target++) = value; - - // In case of long strings, skip unaligned bytes, and then fill the rest in 64-bit chunks. 
- else { - sz_u64_t value64 = (sz_u64_t)value * 0x0101010101010101ull; - while ((sz_size_t)target & 7ull) *(target++) = value; - while (target + 8 <= end) *(sz_u64_t *)target = value64, target += 8; - while (target != end) *(target++) = value; - } -} -#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC -#pragma optimize("", on) -#endif - -SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_MISALIGNED_LOADS - while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)source, target += 8, source += 8, length -= 8; -#endif - while (length--) *(target++) = *(source++); -} - -SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // Implementing `memmove` is trickier, than `memcpy`, as the ranges may overlap. - // Existing implementations often have two passes, in normal and reversed order, - // depending on the relation of `target` and `source` addresses. - // https://student.cs.uwaterloo.ca/~cs350/common/os161-src-html/doxygen/html/memmove_8c_source.html - // https://marmota.medium.com/c-language-making-memmove-def8792bb8d5 - // - // We can use the `memcpy` like left-to-right pass if we know that the `target` is before `source`. - // Or if we know that they don't intersect! In that case the traversal order is irrelevant, - // but older CPUs may predict and fetch forward-passes better. - if (target < source || target >= source + length) { -#if SZ_USE_MISALIGNED_LOADS - while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)(source), target += 8, source += 8, length -= 8; -#endif - while (length--) *(target++) = *(source++); - } - else { - // Jump to the end and walk backwards. - target += length, source += length; -#if SZ_USE_MISALIGNED_LOADS - while (length >= 8) *(sz_u64_t *)(target -= 8) = *(sz_u64_t const *)(source -= 8), length -= 8; -#endif - while (length--) *(--target) = *(--source); - } -} - #pragma endregion /* @@ -3803,7 +3144,7 @@ SZ_INTERNAL sz_bool_t _sz_sort_is_less(sz_sequence_t *sequence, sz_size_t i_key, SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t partial_order_length) { -#if SZ_DETECT_BIG_ENDIAN +#if _SZ_IS_BIG_ENDIAN // TODO: Implement partial sort for big-endian systems. For now this sorts the whole thing. 
sz_unused(partial_order_length); sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); @@ -3824,7 +3165,7 @@ SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t partial_order_ } SZ_PUBLIC void sz_sort(sz_sequence_t *sequence) { -#if SZ_DETECT_BIG_ENDIAN +#if _SZ_IS_BIG_ENDIAN sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); #else sz_sort_partial(sequence, sequence->count); @@ -3839,7 +3180,7 @@ SZ_PUBLIC void sz_sort(sz_sequence_t *sequence) { */ #pragma region AVX2 Implementation -#if SZ_USE_X86_AVX2 +#if SZ_USE_HASWELL #pragma GCC push_options #pragma GCC target("avx2") #pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) @@ -4564,7 +3905,7 @@ SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t start, sz_size_t length, sz_size_t windo */ #pragma region AVX512 Implementation -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE #pragma GCC push_options #pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") #pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) @@ -4690,7 +4031,7 @@ SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr return sz_equal_k; } -SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { +SZ_PUBLIC sz_bool_t sz_equal_skylake(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { __mmask64 mask; sz_u512_vec_t a_vec, b_vec; @@ -4950,7 +4291,7 @@ SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr return SZ_NULL_CHAR; } -SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { +SZ_PUBLIC sz_cptr_t sz_find_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { // This almost never fires, but it's better to be safe than sorry. if (h_length < n_length || !n_length) return SZ_NULL_CHAR; @@ -4982,7 +4323,7 @@ SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); while (matches) { int potential_offset = sz_u64_ctz(matches); - if (sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset; + if (sz_equal_skylake(h + potential_offset, n, n_length)) return h + potential_offset; matches &= matches - 1; } @@ -5040,7 +4381,7 @@ SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); while (matches) { int potential_offset = sz_u64_ctz(matches); - if (n_length <= 3 || sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset; + if (n_length <= 3 || sz_equal_skylake(h + potential_offset, n, n_length)) return h + potential_offset; matches &= matches - 1; } } @@ -5070,7 +4411,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cpt return SZ_NULL_CHAR; } -SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { +SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { // This almost never fires, but it's better to be safe than sorry. 
if (h_length < n_length || !n_length) return SZ_NULL_CHAR; @@ -5101,7 +4442,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); while (matches) { int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_avx512(h + h_length - n_length - potential_offset, n, n_length)) + if (n_length <= 3 || sz_equal_skylake(h + h_length - n_length - potential_offset, n, n_length)) return h + h_length - n_length - potential_offset; sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && "The bit must be set before we squash it"); @@ -5121,7 +4462,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); while (matches) { int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_avx512(h + 64 - potential_offset - 1, n, n_length)) + if (n_length <= 3 || sz_equal_skylake(h + 64 - potential_offset - 1, n, n_length)) return h + 64 - potential_offset - 1; sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && "The bit must be set before we squash it"); @@ -5439,7 +4780,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // ones_u16_vec.zmm = _mm512_set1_epi16(1); // This is a mixed-precision implementation, using 8-bit representations for part of the operations. - // Even there, in case `SZ_USE_X86_AVX2=0`, let's use the `sz_u512_vec_t` type, addressing the first YMM halfs. + // Even there, in case `SZ_USE_HASWELL=0`, let's use the `sz_u512_vec_t` type, addressing the first YMM halfs. sz_u512_vec_t shorter_vec, longer_vec; sz_u512_vec_t ones_u8_vec; ones_u8_vec.ymms[0] = _mm256_set1_epi8(1); @@ -5810,7 +5151,7 @@ SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t win #pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512vbmi,avx512vbmi2,bmi,bmi2"))), \ apply_to = function) -SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { +SZ_PUBLIC void sz_look_up_transform_ice(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { // If the input is tiny (especially smaller than the look-up table itself), we may end up paying // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. @@ -5920,7 +5261,7 @@ SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, s } } -SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { +SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { // Before initializing the AVX-512 vectors, we may want to run the sequential code for the first few bytes. // In practice, that only hurts, even when we have matches every 5-ish bytes. @@ -6035,7 +5376,7 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_ return SZ_NULL_CHAR; } -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { +SZ_PUBLIC sz_cptr_t sz_rfind_charset_ice(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { return sz_rfind_charset_serial(text, length, filter); } @@ -6046,7 +5387,7 @@ SZ_PUBLIC sz_cptr_t sz_find_many_avx512( // // When dealing with huge needles vocabularies, like in tokenization workloads, we need to construct an automaton. 
// But in many cases, the vocabulary is small enough to use a simpler DFA-less approach, combining the ideas from - // the `sz_find_avx512` and `sz_find_charset_avx512` functions. + // the `sz_find_skylake` and `sz_find_charset_ice` functions. // // Pick the offsets within needles where there is the least variance in the characters. // Like for "the", "then", "there", "these", "those", "their", "they", "them", "that", "this", "thus", "than": @@ -6363,7 +5704,7 @@ SZ_PUBLIC sz_bool_t sz_detect_encoding(sz_cptr_t text, sz_size_t length) { */ #pragma region ARM NEON -#if SZ_USE_ARM_NEON +#if SZ_USE_NEON #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+simd") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) @@ -6758,7 +6099,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_ch */ #pragma region ARM SVE -#if SZ_USE_ARM_SVE +#if SZ_USE_SVE #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+sve") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) @@ -6902,11 +6243,11 @@ SZ_PUBLIC void sz_hashes_fingerprint(sz_cptr_t start, sz_size_t length, sz_size_ #if !SZ_DYNAMIC_DISPATCH SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE return sz_checksum_avx512(text, length); -#elif SZ_USE_X86_AVX2 +#elif SZ_USE_HASWELL return sz_checksum_avx2(text, length); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_NEON return sz_checksum_neon(text, length); #else return sz_checksum_serial(text, length); @@ -6914,11 +6255,11 @@ SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { } SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { -#if SZ_USE_X86_AVX512 - return sz_equal_avx512(a, b, length); -#elif SZ_USE_X86_AVX2 +#if SZ_USE_ICE + return sz_equal_skylake(a, b, length); +#elif SZ_USE_HASWELL return sz_equal_avx2(a, b, length); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_NEON return sz_equal_neon(a, b, length); #else return sz_equal_serial(a, b, length); @@ -6926,11 +6267,11 @@ SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { } SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE return sz_order_avx512(a, a_length, b, b_length); -#elif SZ_USE_X86_AVX2 +#elif SZ_USE_HASWELL return sz_order_avx2(a, a_length, b, b_length); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_NEON return sz_order_neon(a, a_length, b, b_length); #else return sz_order_serial(a, a_length, b, b_length); @@ -6938,11 +6279,11 @@ SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, } SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE sz_copy_avx512(target, source, length); -#elif SZ_USE_X86_AVX2 +#elif SZ_USE_HASWELL sz_copy_avx2(target, source, length); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_NEON sz_copy_neon(target, source, length); #else sz_copy_serial(target, source, length); @@ -6950,11 +6291,11 @@ SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { } SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE sz_move_avx512(target, source, length); -#elif SZ_USE_X86_AVX2 +#elif SZ_USE_HASWELL sz_move_avx2(target, source, length); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_NEON sz_move_neon(target, source, length); #else sz_move_serial(target, source, 
length); @@ -6962,11 +6303,11 @@ SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { } SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) { -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE sz_fill_avx512(target, length, value); -#elif SZ_USE_X86_AVX2 +#elif SZ_USE_HASWELL sz_fill_avx2(target, length, value); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_NEON sz_fill_neon(target, length, value); #else sz_fill_serial(target, length, value); @@ -6974,11 +6315,11 @@ SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) { } SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { -#if SZ_USE_X86_AVX512 - sz_look_up_transform_avx512(source, length, lut, target); -#elif SZ_USE_X86_AVX2 +#if SZ_USE_ICE + sz_look_up_transform_ice(source, length, lut, target); +#elif SZ_USE_HASWELL sz_look_up_transform_avx2(source, length, lut, target); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_NEON sz_look_up_transform_neon(source, length, lut, target); #else sz_look_up_transform_serial(source, length, lut, target); @@ -6986,11 +6327,11 @@ SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t source, sz_size_t length, sz_cptr } SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE return sz_find_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_X86_AVX2 +#elif SZ_USE_HASWELL return sz_find_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_NEON return sz_find_byte_neon(haystack, h_length, needle); #else return sz_find_byte_serial(haystack, h_length, needle); @@ -6998,11 +6339,11 @@ SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cpt } SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE return sz_rfind_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_X86_AVX2 +#elif SZ_USE_HASWELL return sz_rfind_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_NEON return sz_rfind_byte_neon(haystack, h_length, needle); #else return sz_rfind_byte_serial(haystack, h_length, needle); @@ -7010,11 +6351,11 @@ SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cp } SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_X86_AVX512 - return sz_find_avx512(haystack, h_length, needle, n_length); -#elif SZ_USE_X86_AVX2 +#if SZ_USE_ICE + return sz_find_skylake(haystack, h_length, needle, n_length); +#elif SZ_USE_HASWELL return sz_find_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_NEON return sz_find_neon(haystack, h_length, needle, n_length); #else return sz_find_serial(haystack, h_length, needle, n_length); @@ -7022,11 +6363,11 @@ SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t n } SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_X86_AVX512 - return sz_rfind_avx512(haystack, h_length, needle, n_length); -#elif SZ_USE_X86_AVX2 +#if SZ_USE_ICE + return sz_rfind_skylake(haystack, h_length, needle, n_length); +#elif SZ_USE_HASWELL return sz_rfind_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_NEON return sz_rfind_neon(haystack, h_length, needle, n_length); #else return sz_rfind_serial(haystack, h_length, needle, n_length); @@ -7034,11 +6375,11 @@ 
SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t } SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_X86_AVX512 - return sz_find_charset_avx512(text, length, set); -#elif SZ_USE_X86_AVX2 +#if SZ_USE_ICE + return sz_find_charset_ice(text, length, set); +#elif SZ_USE_HASWELL return sz_find_charset_avx2(text, length, set); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_NEON return sz_find_charset_neon(text, length, set); #else return sz_find_charset_serial(text, length, set); @@ -7046,11 +6387,11 @@ SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charse } SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_X86_AVX512 - return sz_rfind_charset_avx512(text, length, set); -#elif SZ_USE_X86_AVX2 +#if SZ_USE_ICE + return sz_rfind_charset_ice(text, length, set); +#elif SZ_USE_HASWELL return sz_rfind_charset_avx2(text, length, set); -#elif SZ_USE_ARM_NEON +#elif SZ_USE_NEON return sz_rfind_charset_neon(text, length, set); #else return sz_rfind_charset_serial(text, length, set); @@ -7075,7 +6416,7 @@ SZ_DYNAMIC sz_size_t sz_edit_distance( // sz_cptr_t a, sz_size_t a_length, // sz_cptr_t b, sz_size_t b_length, // sz_size_t bound, sz_memory_allocator_t *alloc) { -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE return sz_edit_distance_avx512(a, a_length, b, b_length, bound, alloc); #else return sz_edit_distance_serial(a, a_length, b, b_length, bound, alloc); @@ -7092,7 +6433,7 @@ SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( // SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE return sz_alignment_score_avx512(a, a_length, b, b_length, subs, gap, alloc); #else return sz_alignment_score_serial(a, a_length, b, b_length, subs, gap, alloc); @@ -7101,9 +6442,9 @@ SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cpt SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // sz_hash_callback_t callback, void *callback_handle) { -#if SZ_USE_X86_AVX512 +#if SZ_USE_ICE sz_hashes_avx512(text, length, window_length, window_step, callback, callback_handle); -#elif SZ_USE_X86_AVX2 +#elif SZ_USE_HASWELL sz_hashes_avx2(text, length, window_length, window_step, callback, callback_handle); #else sz_hashes_serial(text, length, window_length, window_step, callback, callback_handle); From fc408fa0a0f2d947c610568bd7a5c4a60ecca443 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 19:24:49 +0000 Subject: [PATCH 043/751] Make: Split ./include/stringzilla/find.h to ./include/stringzilla/compare.h --- include/stringzilla/{find.h => compare.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/{find.h => compare.h} (100%) diff --git a/include/stringzilla/find.h b/include/stringzilla/compare.h similarity index 100% rename from include/stringzilla/find.h rename to include/stringzilla/compare.h From 49e8d9d240993bdf68715a9c87824a032752798d Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 19:24:50 +0000 Subject: [PATCH 044/751] Make: Split ./include/stringzilla/find.h to ./include/stringzilla/compare.h --- include/stringzilla/find.h => temp | 0 1 file changed, 0 
insertions(+), 0 deletions(-) rename include/stringzilla/find.h => temp (100%) diff --git a/include/stringzilla/find.h b/temp similarity index 100% rename from include/stringzilla/find.h rename to temp From fc9e5d61e5fb1c5031f6f10920f6b50e2530de1e Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 19:24:50 +0000 Subject: [PATCH 045/751] Make: Split ./include/stringzilla/find.h to ./include/stringzilla/compare.h --- temp => include/stringzilla/find.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename temp => include/stringzilla/find.h (100%) diff --git a/temp b/include/stringzilla/find.h similarity index 100% rename from temp rename to include/stringzilla/find.h From 6512f1d129aeddc8601c9df7332c135038914b68 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Dec 2024 19:54:45 +0000 Subject: [PATCH 046/751] Fix: Filter `compare.h` file --- include/stringzilla/compare.h | 1716 +++------------------------------ include/stringzilla/find.h | 82 +- 2 files changed, 150 insertions(+), 1648 deletions(-) diff --git a/include/stringzilla/compare.h b/include/stringzilla/compare.h index 4571515d..9f2e276d 100644 --- a/include/stringzilla/compare.h +++ b/include/stringzilla/compare.h @@ -1,24 +1,17 @@ /** - * @brief Hardware-accelerated sub-string and character-set search utilities. - * @file find.h + * @brief Hardware-accelerated string comparison utilities. + * @file compare.h * @author Ash Vardanian * * Includes core APIs: * - * - `sz_equal` - * - `sz_find` and reverse-order `sz_rfind` - * - `sz_find_byte` and reverse-order `sz_rfind_byte` - * - `sz_find_charset` and reverse-order `sz_rfind_charset` - * - * Convenience functions for character-set matching: - * - * - `sz_find_char_from` - * - `sz_find_char_not_from` - * - `sz_rfind_char_from` - * - `sz_rfind_char_not_from` + * - `sz_equal` - for equality comparison of two strings. + * - `sz_order` - for the relative order of two strings, similar to `memcmp`. + * - TODO: `sz_mismatch`, `sz_rmismatch` - to supersede `sz_equal`. + * - TODO: `sz_order_utf8` - for the relative order of two UTF-8 strings. */ -#ifndef STRINGZILLA_FIND_H_ -#define STRINGZILLA_FIND_H_ +#ifndef STRINGZILLA_COMPARE_H_ +#define STRINGZILLA_COMPARE_H_ #include "types.h" @@ -29,165 +22,56 @@ extern "C" { #pragma region Core API /** - * @brief Locates first matching byte in a string. Equivalent to `memchr(haystack, *needle, h_length)` in LibC. - * - * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memchr.S - * Aarch64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/aarch64/memchr.S - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the first match. - */ -SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** - * @brief Locates last matching byte in a string. Equivalent to `memrchr(haystack, *needle, h_length)` in LibC. - * - * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memrchr.S - * Aarch64 implementation: missing + * @brief Checks if two string are equal. + * Similar to `memcmp(a, b, length) == 0` in LibC and `a == b` in STL. * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. 
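Looking back at the dispatching hunks earlier in this series: every `SZ_DYNAMIC` entry point cascades through the same ISA toggles. Below is a minimal, purely illustrative sketch of that pattern; the macro names match the patch, but the `fill_*` backends are hypothetical stand-ins and the defaults are chosen only so the sketch compiles anywhere.

    #include <stddef.h>

    /* Normally defined by the build system; default them off so this sketch
       always falls back to the portable serial path. */
    #ifndef SZ_USE_ICE
    #define SZ_USE_ICE 0
    #endif
    #ifndef SZ_USE_HASWELL
    #define SZ_USE_HASWELL 0
    #endif
    #ifndef SZ_USE_NEON
    #define SZ_USE_NEON 0
    #endif

    static void fill_serial(char *target, size_t length, unsigned char value) {
        for (size_t i = 0; i != length; ++i) target[i] = (char)value;
    }

    void fill_dispatch(char *target, size_t length, unsigned char value) {
    #if SZ_USE_ICE
        fill_ice(target, length, value);     /* hypothetical AVX-512 backend */
    #elif SZ_USE_HASWELL
        fill_haswell(target, length, value); /* hypothetical AVX2 backend */
    #elif SZ_USE_NEON
        fill_neon(target, length, value);    /* hypothetical NEON backend */
    #else
        fill_serial(target, length, value);
    #endif
    }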
- * @param needle Needle - single-byte substring to find. - * @return Address of the last match. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -#if SZ_USE_HASWELL -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -#endif - -#if SZ_USE_SKYLAKE -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_skylake(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_skylake(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -#endif - -#if SZ_USE_NEON -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -#endif - -/** - * @brief Locates first matching substring. - * Equivalent to `memmem(haystack, h_length, needle, n_length)` in LibC. - * Similar to `strstr(haystack, needle)` in LibC, but requires known length. + * The implementation of this function is very similar to `sz_order`, but the usage patterns are different. + * This function is more often used in parsing, while `sz_order` is often used in sorting. + * It works best on platforms with cheap * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the first match. + * @param a First string to compare. + * @param b Second string to compare. + * @param length Number of bytes in both strings. + * @return 1 if strings match, 0 otherwise. */ -SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); +SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length); /** - * @brief Locates the last matching substring. + * @brief Estimates the relative order of two strings. Equivalent to `memcmp(a, b, length)` in LibC. + * Can be used on different length strings. * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the last match. + * @param a First string to compare. + * @param a_length Number of bytes in the first string. + * @param b Second string to compare. + * @param b_length Number of bytes in the second string. + * @return Negative if (a < b), positive if (a > b), zero if they are equal. 
*/ -SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); +SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); +/** @copydoc sz_equal */ +SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length); +/** @copydoc sz_order */ +SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); #if SZ_USE_HASWELL -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +/** @copydoc sz_equal */ +SZ_PUBLIC sz_bool_t sz_equal_haswell(sz_cptr_t a, sz_cptr_t b, sz_size_t length); +/** @copydoc sz_order */ +SZ_PUBLIC sz_ordering_t sz_order_haswell(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); #endif #if SZ_USE_SKYLAKE -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_skylake(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -#endif - -#if SZ_USE_NEON -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -#endif - -/** - * @brief Finds the first character present from the ::set, present in ::text. - * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_rfind_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". - * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the first matching character from ::set. - */ -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** - * @brief Finds the last character present from the ::set, present in ::text. - * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_find_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". - * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the last matching character from ::set. 
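A small usage sketch for the two entry points declared in the new `compare.h` above. It assumes the header assembled in this patch is on the include path; only the sign convention documented above and the `sz_true_k` / `sz_equal_k` enumerators that appear later in the file are relied upon.

    #include <string.h> /* strlen, only for the demo */
    #include "stringzilla/compare.h"

    /* `sz_equal` compares a fixed number of bytes; `sz_order` also handles
       different lengths, treating the shorter string as the smaller one when
       it is a prefix of the longer one. */
    int compare_demo(void) {
        sz_cptr_t a = "abc", b = "abcd";
        sz_bool_t same_prefix = sz_equal(a, b, 3);                  /* true: first 3 bytes match */
        sz_ordering_t order = sz_order(a, strlen(a), b, strlen(b)); /* "abc" precedes "abcd" */
        return same_prefix == sz_true_k && order != sz_equal_k;
    }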
- */ -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -#if SZ_USE_HASWELL -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -#endif - -#if SZ_USE_ICE -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_ice(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +/** @copydoc sz_equal */ +SZ_PUBLIC sz_bool_t sz_equal_skylake(sz_cptr_t a, sz_cptr_t b, sz_size_t length); +/** @copydoc sz_order */ +SZ_PUBLIC sz_ordering_t sz_order_skylake(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); #endif #if SZ_USE_NEON -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +/** @copydoc sz_equal */ +SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length); +/** @copydoc sz_order */ +SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); #endif #pragma endregion // Core API @@ -214,586 +98,23 @@ SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length) return (sz_bool_t)(a_end == a); } -/** - * @brief Chooses the offsets of the most interesting characters in a search needle. - * - * Search throughput can significantly deteriorate if we are matching the wrong characters. - * Say the needle is "aXaYa", and we are comparing the first, second, and last character. - * If we use SIMD and compare many offsets at a time, comparing against "a" in every register is a waste. - * - * Similarly, dealing with UTF8 inputs, we know that the lower bits of each character code carry more information. - * Cyrillic alphabet, for example, falls into [0x0410, 0x042F] code range for uppercase [А, Я], and - * into [0x0430, 0x044F] for lowercase [а, я]. Scanning through a text written in Russian, half of the - * bytes will carry absolutely no value and will be equal to 0x04. - */ -SZ_INTERNAL void _sz_locate_needle_anomalies( // - sz_cptr_t start, sz_size_t length, // - sz_size_t *first, sz_size_t *second, sz_size_t *third) { - - *first = 0; - *second = length / 2; - *third = length - 1; - - // - int has_duplicates = // - start[*first] == start[*second] || // - start[*first] == start[*third] || // - start[*second] == start[*third]; - - // Loop through letters to find non-colliding variants. - if (length > 3 && has_duplicates) { - // Pivot the middle point right, until we find a character different from the first one. - while (start[*second] == start[*first] && *second + 1 < *third) ++(*second); - // Pivot the third (last) point left, until we find a different character. 
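The pivoting described above is easier to see on a concrete needle. Here is a simplified restatement of the duplicate-avoidance step only (ignoring the UTF-8 refinement that follows), with a worked example in the trailing comment:

    #include <stddef.h>

    /* Simplified sketch: start from {0, len/2, len-1} and nudge the middle and
       last offsets until the three probed bytes stop colliding. */
    static void pick_probe_offsets(char const *n, size_t len, size_t *first, size_t *second, size_t *third) {
        *first = 0, *second = len / 2, *third = len - 1;
        int has_duplicates = n[*first] == n[*second] || n[*first] == n[*third] || n[*second] == n[*third];
        if (len > 3 && has_duplicates) {
            while (n[*second] == n[*first] && *second + 1 < *third) ++*second;
            while ((n[*third] == n[*second] || n[*third] == n[*first]) && *third > *second + 1) --*third;
        }
    }

    /* For "aaXYa" (length 5) the naive picks are {0, 2, 4} = 'a', 'X', 'a';
       the last offset pivots left to 3, giving 'a', 'X', 'Y' - three distinct bytes. */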
- while ((start[*third] == start[*second] || start[*third] == start[*first]) && *third > (*second + 1)) - --(*third); - } - - // TODO: Investigate alternative strategies for long needles. - // On very long needles we have the luxury to choose! - // Often dealing with UTF8, we will likely benefit from shifting the first and second characters - // further to the right, to achieve not only uniqueness within the needle, but also avoid common - // rune prefixes of 2-, 3-, and 4-byte codes. - if (length > 8) { - // Pivot the first and second points right, until we find a character, that: - // > is different from others. - // > doesn't start with 0b'110x'xxxx - only 5 bits of relevant info. - // > doesn't start with 0b'1110'xxxx - only 4 bits of relevant info. - // > doesn't start with 0b'1111'0xxx - only 3 bits of relevant info. - // - // So we are practically searching for byte values that start with 0b0xxx'xxxx or 0b'10xx'xxxx. - // Meaning they fall in the range [0, 127] and [128, 191], in other words any unsigned int up to 191. - sz_u8_t const *start_u8 = (sz_u8_t const *)start; - sz_size_t vibrant_first = *first, vibrant_second = *second, vibrant_third = *third; - - // Let's begin with the seccond character, as the termination criteria there is more obvious - // and we may end up with more variants to check for the first candidate. - while ((start_u8[vibrant_second] > 191 || start_u8[vibrant_second] == start_u8[vibrant_third]) && - (vibrant_second + 1 < vibrant_third)) - ++vibrant_second; - - // Now check if we've indeed found a good candidate or should revert the `vibrant_second` to `second`. - if (start_u8[vibrant_second] < 191) { *second = vibrant_second; } - else { vibrant_second = *second; } - - // Now check the first character. - while ((start_u8[vibrant_first] > 191 || start_u8[vibrant_first] == start_u8[vibrant_second] || - start_u8[vibrant_first] == start_u8[vibrant_third]) && - (vibrant_first + 1 < vibrant_second)) - ++vibrant_first; - - // Now check if we've indeed found a good candidate or should revert the `vibrant_first` to `first`. - // We don't need to shift the third one when dealing with texts as the last byte of the text is - // also the last byte of a rune and contains the most information. - if (start_u8[vibrant_first] < 191) { *first = vibrant_first; } - } -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { - for (sz_cptr_t const end = text + length; text != end; ++text) - if (sz_charset_contains(set, *text)) return text; - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Warray-bounds" - sz_cptr_t const end = text; - for (text += length; text != end;) - if (sz_charset_contains(set, *(text -= 1))) return text; - return SZ_NULL_CHAR; -#pragma GCC diagnostic pop -} - -/* Find the first occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memchr(haystack, needle[0], haystack_length)`. - */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_end = h + h_length; - -#if !_SZ_IS_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevity. -#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. 
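The byte search below broadcasts the needle byte with a `0x0101...01` multiplier and then looks for zero bytes in the XOR of haystack and needle. One standard scalar formulation of that zero-byte test (not necessarily the exact helper used in this file) looks like this:

    #include <stdint.h>

    /* Returns the index of the first byte of `haystack8` equal to `needle`, or -1.
       Assumes the 8 haystack bytes were loaded little-endian, as in the code below. */
    static int first_equal_byte(uint64_t haystack8, uint8_t needle) {
        uint64_t broadcast = (uint64_t)needle * 0x0101010101010101ull; /* needle copied into all 8 lanes */
        uint64_t x = haystack8 ^ broadcast;                            /* matching bytes become 0x00 */
        uint64_t found = (x - 0x0101010101010101ull) & ~x & 0x8080808080808080ull;
        if (!found) return -1;
        return (int)(__builtin_ctzll(found) / 8); /* GCC/Clang builtin; lowest set bit marks the first match */
    }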
- for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. - sz_u64_vec_t h_vec, n_vec, match_vec; - match_vec.u64 = 0; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h + 8 <= h_end; h += 8) { - h_vec.u64 = *(sz_u64_t const *)h; - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h + sz_u64_ctz(match_vec.u64) / 8; +SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { + sz_bool_t a_shorter = (sz_bool_t)(a_length < b_length); + sz_size_t min_length = a_shorter ? a_length : b_length; + sz_cptr_t min_end = a + min_length; +#if SZ_USE_MISALIGNED_LOADS && !_SZ_IS_BIG_ENDIAN + for (sz_u64_vec_t a_vec, b_vec; a + 8 <= min_end; a += 8, b += 8) { + a_vec = sz_u64_load(a); + b_vec = sz_u64_load(b); + if (a_vec.u64 != b_vec.u64) + return _sz_order_scalars(sz_u64_bytes_reverse(a_vec.u64), sz_u64_bytes_reverse(b_vec.u64)); } #endif + for (; a != min_end; ++a, ++b) + if (*a != *b) return _sz_order_scalars(*a, *b); - // Handle the misaligned tail. - for (; h < h_end; ++h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/* Find the last occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memrchr(haystack, needle[0], haystack_length)`. - */ -sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_start = h; - - // Reposition the `h` pointer to the end, as we will be walking backwards. - h = h + h_length - 1; - -#if !_SZ_IS_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevity. -#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)(h + 1) & 7ull) && h >= h_start; --h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. - sz_u64_vec_t h_vec, n_vec, match_vec; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h >= h_start + 7; h -= 8) { - h_vec.u64 = *(sz_u64_t const *)(h - 7); - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h - sz_u64_clz(match_vec.u64) / 8; - } -#endif - - for (; h >= h_start; --h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 2Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 2byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_2byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 2byte is set. - // For that take the bottom 15 bits of each 2byte, add one to them, - // and if this sets the top bit to one, then all the 15 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFF7FFF7FFF7FFFull) + 0x0001000100010001ull) & ((vec.u64 & 0x8000800080008000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b two-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. 
- */ -SZ_INTERNAL sz_cptr_t _sz_find_2byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 2 bytes long. - sz_assert(h_length >= 2 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; -#endif - - sz_u64_vec_t h_even_vec, h_odd_vec, n_vec, matches_even_vec, matches_odd_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1]; - n_vec.u64 *= 0x0001000100010001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. - for (; h + 9 <= h_end; h += 8) { - h_even_vec.u64 = *(sz_u64_t *)h; - h_odd_vec.u64 = (h_even_vec.u64 >> 8) | ((sz_u64_t)h[8] << 56); - matches_even_vec = _sz_u64_each_2byte_equal(h_even_vec, n_vec); - matches_odd_vec = _sz_u64_each_2byte_equal(h_odd_vec, n_vec); - - matches_even_vec.u64 >>= 8; - if (matches_even_vec.u64 + matches_odd_vec.u64) { - sz_u64_t match_indicators = matches_even_vec.u64 | matches_odd_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 4Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 4byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_4byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 4byte is set. - // For that take the bottom 31 bits of each 4byte, add one to them, - // and if this sets the top bit to one, then all the 31 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFFFFFF7FFFFFFFull) + 0x0000000100000001ull) & ((vec.u64 & 0x8000000080000000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b four-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_4byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 4 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 4 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; -#endif - - sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, n_vec, matches0_vec, matches1_vec, matches2_vec, matches3_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2], n_vec.u8s[3] = n[3]; - n_vec.u64 *= 0x0000000100000001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using four 64-bit words. - // We load the subsequent four-byte word as well, taking its first bytes. 
Think of it as a glorified prefetch :) - sz_u64_t h_page_current, h_page_next; - for (; h + sizeof(sz_u64_t) + sizeof(sz_u32_t) <= h_end; h += sizeof(sz_u64_t)) { - h_page_current = *(sz_u64_t *)h; - h_page_next = *(sz_u32_t *)(h + 8); - h0_vec.u64 = (h_page_current); - h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); - h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); - h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); - matches0_vec = _sz_u64_each_4byte_equal(h0_vec, n_vec); - matches1_vec = _sz_u64_each_4byte_equal(h1_vec, n_vec); - matches2_vec = _sz_u64_each_4byte_equal(h2_vec, n_vec); - matches3_vec = _sz_u64_each_4byte_equal(h3_vec, n_vec); - - if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64) { - matches0_vec.u64 >>= 24; - matches1_vec.u64 >>= 16; - matches2_vec.u64 >>= 8; - sz_u64_t match_indicators = matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 4 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 3Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 3byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_3byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 4byte is set. - // For that take the bottom 31 bits of each 4byte, add one to them, - // and if this sets the top bit to one, then all the 31 bits are ones as well. - vec.u64 = ((vec.u64 & 0xFFFF7FFFFF7FFFFFull) + 0x0000000001000001ull) & ((vec.u64 & 0x0000800000800000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b three-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_3byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 3 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 3 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h; -#endif - - // We fetch 12 - sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, h4_vec; - sz_u64_vec_t matches0_vec, matches1_vec, matches2_vec, matches3_vec, matches4_vec; - sz_u64_vec_t n_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2]; - n_vec.u64 *= 0x0000000001000001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using three 64-bit words. - // We load the subsequent two-byte word as well. 
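The needle broadcasts above rely on multiplication by sparse constants: `0x0001000100010001` replicates a 2-byte needle four times, and `0x0000000001000001` lays a 3-byte needle down twice at a 3-byte stride. A tiny self-contained check of both, assuming little-endian loads as in the surrounding code:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        uint64_t two_byte = 0x6261ull;     /* "ab" read little-endian */
        uint64_t three_byte = 0x636261ull; /* "abc" read little-endian */
        assert(two_byte * 0x0001000100010001ull == 0x6261626162616261ull);   /* bytes: "abababab" */
        assert(three_byte * 0x0000000001000001ull == 0x0000636261636261ull); /* bytes: "abcabc\0\0" */
        return 0;
    }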
- sz_u64_t h_page_current, h_page_next; - for (; h + sizeof(sz_u64_t) + sizeof(sz_u16_t) <= h_end; h += sizeof(sz_u64_t)) { - h_page_current = *(sz_u64_t *)h; - h_page_next = *(sz_u16_t *)(h + 8); - h0_vec.u64 = (h_page_current); - h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); - h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); - h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); - h4_vec.u64 = (h_page_current >> 32) | (h_page_next << 32); - matches0_vec = _sz_u64_each_3byte_equal(h0_vec, n_vec); - matches1_vec = _sz_u64_each_3byte_equal(h1_vec, n_vec); - matches2_vec = _sz_u64_each_3byte_equal(h2_vec, n_vec); - matches3_vec = _sz_u64_each_3byte_equal(h3_vec, n_vec); - matches4_vec = _sz_u64_each_3byte_equal(h4_vec, n_vec); - - if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64) { - matches0_vec.u64 >>= 16; - matches1_vec.u64 >>= 8; - matches3_vec.u64 <<= 8; - matches4_vec.u64 <<= 16; - sz_u64_t match_indicators = - matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 3 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_find_horspool_upto_256bytes_serial( // - sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - // Several popular string matching algorithms are using a bad-character shift table. - // Boyer Moore: https://www-igm.univ-mlv.fr/~lecroq/string/node14.html - // Quick Search: https://www-igm.univ-mlv.fr/~lecroq/string/node19.html - // Smith: https://www-igm.univ-mlv.fr/~lecroq/string/node21.html - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) bad_shift_table.jumps[n[i]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the last `n_length - 1` bytes. 
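For reference, the bad-character table above is the classic Horspool construction; the version in this file adds the Raita-style four-byte fingerprint on top of it. A textbook sketch without that fingerprint, for comparison:

    #include <stddef.h>
    #include <string.h>

    /* Plain Horspool: shift by the distance of the window's last byte from the
       end of the needle, verify candidates with a full comparison. */
    static char const *find_horspool(char const *h, size_t h_len, char const *n, size_t n_len) {
        if (!n_len || h_len < n_len) return NULL;
        size_t jumps[256];
        for (size_t i = 0; i != 256; ++i) jumps[i] = n_len;
        for (size_t i = 0; i + 1 < n_len; ++i) jumps[(unsigned char)n[i]] = n_len - i - 1;
        for (size_t i = 0; i + n_len <= h_len;) {
            if (memcmp(h + i, n, n_len) == 0) return h + i;
            i += jumps[(unsigned char)h[i + n_len - 1]];
        }
        return NULL;
    }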
- for (sz_size_t i = 0; i <= h_length - n_length;) { - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal_serial((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - i += bad_shift_table.jumps[h[i + n_length - 1]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for @b reverse-order exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_upto_256bytes_serial( // - sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) - bad_shift_table.jumps[n[n_length - i - 1]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the first `n_length - 1` bytes. - for (sz_size_t j = 0; j <= h_length - n_length;) { - sz_size_t i = h_length - n_length - j; - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal_serial((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - j += bad_shift_table.jumps[h[i]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Exact substring search helper function, that finds the first occurrence of a prefix of the needle - * using a given search function, and then verifies the remaining part of the needle. - */ -SZ_INTERNAL sz_cptr_t _sz_find_with_prefix( // - sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, sz_find_t find_prefix, sz_size_t prefix_length) { - - sz_size_t suffix_length = n_length - prefix_length; - while (1) { - sz_cptr_t found = find_prefix(h, h_length, n, prefix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = h_length - (found - h); - if (remaining < n_length) return SZ_NULL_CHAR; - if (sz_equal_serial(found + prefix_length, n + prefix_length, suffix_length)) return found; - - // Adjust the position. 
- h = found + 1; - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -/** - * @brief Exact reverse-order substring search helper function, that finds the last occurrence of a suffix of the - * needle using a given search function, and then verifies the remaining part of the needle. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_with_suffix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_suffix, sz_size_t suffix_length) { - - sz_size_t prefix_length = n_length - suffix_length; - while (1) { - sz_cptr_t found = find_suffix(h, h_length, n + prefix_length, suffix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = found - h; - if (remaining < prefix_length) return SZ_NULL_CHAR; - if (sz_equal_serial(found - prefix_length, n, prefix_length)) return found - prefix_length; - - // Adjust the position. - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -SZ_INTERNAL sz_cptr_t _sz_find_over_4bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, (sz_find_t)_sz_find_4byte_serial, 4); -} - -SZ_INTERNAL sz_cptr_t _sz_find_horspool_over_256bytes_serial( // - sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, _sz_find_horspool_upto_256bytes_serial, 256); -} - -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_over_256bytes_serial( // - sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - return _sz_rfind_with_suffix(h, h_length, n, n_length, _sz_rfind_horspool_upto_256bytes_serial, 256); -} - -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - -#if _SZ_IS_BIG_ENDIAN - sz_find_t backends[] = { - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[(n_length > 1) + (n_length > 256)](h, h_length, n, n_length); -#else - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_2byte_serial, - (sz_find_t)_sz_find_3byte_serial, - (sz_find_t)_sz_find_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - (sz_find_t)_sz_find_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. - (n_length > 1) + (n_length > 2) + (n_length > 3) + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 4) + - // For longer needles - use skip tables. - (n_length > 8) + (n_length > 256)](h, h_length, n, n_length); -#endif -} - -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. 
- (sz_find_t)sz_rfind_byte_serial, - // TODO: implement reverse-order SWAR for 2/3/4 byte variants. - // TODO: (sz_find_t)_sz_rfind_2byte_serial, - // TODO: (sz_find_t)_sz_rfind_3byte_serial, - // TODO: (sz_find_t)_sz_rfind_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - // (sz_find_t)_sz_rfind_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_rfind_horspool_upto_256bytes_serial, - (sz_find_t)_sz_rfind_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. - 0 + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 1) + - // For longer needles - use skip tables. - (n_length > 256)](h, h_length, n, n_length); + // If the strings are equal up to `min_end`, then the shorter string is smaller + return _sz_order_scalars(a_length, b_length); } #pragma endregion // Serial Implementation @@ -804,8 +125,14 @@ SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n #pragma region Haswell Implementation #if SZ_USE_HASWELL #pragma GCC push_options -#pragma GCC target("haswell") -#pragma clang attribute push(__attribute__((target("haswell"))), apply_to = function) +#pragma GCC target("avx2") +#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) + +SZ_PUBLIC sz_ordering_t sz_order_haswell(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { + //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: + //! https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations + return sz_order_serial(a, a_length, b, b_length); +} SZ_PUBLIC sz_bool_t sz_equal_haswell(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { sz_u256_vec_t a_vec, b_vec; @@ -823,203 +150,6 @@ SZ_PUBLIC sz_bool_t sz_equal_haswell(sz_cptr_t a, sz_cptr_t b, sz_size_t length) return sz_true_k; } -SZ_PUBLIC sz_cptr_t sz_find_byte_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - int mask; - sz_u256_vec_t h_vec, n_vec; - n_vec.ymm = _mm256_set1_epi8(n[0]); - - while (h_length >= 32) { - h_vec.ymm = _mm256_lddqu_si256((__m256i const *)h); - mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm)); - if (mask) return h + sz_u32_ctz(mask); - h += 32, h_length -= 32; - } - - return sz_find_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - int mask; - sz_u256_vec_t h_vec, n_vec; - n_vec.ymm = _mm256_set1_epi8(n[0]); - - while (h_length >= 32) { - h_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + h_length - 32)); - mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm)); - if (mask) return h + h_length - 1 - sz_u32_clz(mask); - h_length -= 32; - } - - return sz_rfind_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_find_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_haswell(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into YMM registers. 
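Circling back to the `sz_order_serial` hunk above: the byte-reversal before comparing 8-byte chunks is what keeps the comparison lexicographic on little-endian machines, where the first byte of the string lands in the least significant position of the loaded word. A small check of that, assuming a little-endian host and using the GCC/Clang `__builtin_bswap64` as a stand-in for `sz_u64_bytes_reverse`:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
        /* "abcdefgz" precedes "bbcdefga" lexicographically, but the raw little-endian
           integers compare the other way around, because the last byte is the most
           significant one; reversing the bytes restores the expected order. */
        char a[] = "abcdefgz", b[] = "bbcdefga";
        uint64_t a_word, b_word;
        memcpy(&a_word, a, 8);
        memcpy(&b_word, b, 8);
        assert(memcmp(a, b, 8) < 0);
        assert(a_word > b_word);
        assert(__builtin_bswap64(a_word) < __builtin_bswap64(b_word));
        return 0;
    }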
- int matches; - sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]); - n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]); - n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]); - - // Scan through the string. - for (; h_length >= n_length + 32; h += 32, h_length -= 32) { - h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_first)); - h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_mid)); - h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_last)); - matches = // - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); - while (matches) { - int potential_offset = sz_u32_ctz(matches); - if (sz_equal_haswell(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - - return sz_find_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_haswell(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into YMM registers. - int matches; - sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]); - n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]); - n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]); - - // Scan through the string. - sz_cptr_t h_reversed; - for (; h_length >= n_length + 32; h_length -= 32) { - h_reversed = h + h_length - n_length - 32 + 1; - h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_first)); - h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_mid)); - h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_last)); - matches = // - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); - while (matches) { - int potential_offset = sz_u32_clz(matches); - if (sz_equal_haswell(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - matches &= ~(1 << (31 - potential_offset)); - } - } - - return sz_rfind_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_haswell(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - - // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. - // That way when we invoke `_mm256_shuffle_epi8` we can use the same mask for both lanes. 
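The `while (matches)` loops above use two small bit tricks worth spelling out: count-trailing-zeros turns the lowest set bit of a movemask into a byte offset, and `matches &= matches - 1` clears that bit to move on to the next candidate. A scalar illustration of the same iteration, using the GCC/Clang builtin:

    #include <stdint.h>

    /* Visits every candidate offset encoded in a 32-bit match mask, lowest first. */
    static int visit_candidates(uint32_t matches) {
        int visited = 0;
        while (matches) {
            int offset = __builtin_ctz(matches); /* lowest set bit = earliest candidate byte */
            (void)offset;                        /* a real caller verifies the full needle at this offset */
            matches &= matches - 1;              /* clear the lowest set bit */
            ++visited;
        }
        return visited;
    }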
- sz_u256_vec_t filter_even_vec, filter_odd_vec; - for (sz_size_t i = 0; i != 16; ++i) - filter_even_vec.u8s[i] = filter->_u8s[i * 2], filter_odd_vec.u8s[i] = filter->_u8s[i * 2 + 1]; - filter_even_vec.xmms[1] = filter_even_vec.xmms[0]; - filter_odd_vec.xmms[1] = filter_odd_vec.xmms[0]; - - sz_u256_vec_t text_vec; - sz_u256_vec_t matches_vec; - sz_u256_vec_t lower_nibbles_vec, higher_nibbles_vec; - sz_u256_vec_t bitset_even_vec, bitset_odd_vec; - sz_u256_vec_t bitmask_vec, bitmask_lookup_vec; - bitmask_lookup_vec.ymm = _mm256_set_epi8( // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); - - while (length >= 32) { - // The following algorithm is a transposed equivalent of the "SIMD-ized check which bytes are in a set" - // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so - // StrinZilla uses a somewhat different approach. - // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new - // - // sz_u8_t input = *(sz_u8_t const *)text; - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = filter_even_vec.u8s[hi_nibble]; - // sz_u8_t bitset_odd = filter_odd_vec.u8s[hi_nibble]; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_u8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd; - // if ((bitset & bitmask) != 0) return text; - // else { length--, text++; } - // - // The nice part about this, loading the strided data is vey easy with Arm NEON, - // while with x86 CPUs after AVX, shuffles within 256 bits shouldn't be an issue either. - text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); - lower_nibbles_vec.ymm = _mm256_and_si256(text_vec.ymm, _mm256_set1_epi8(0x0f)); - bitmask_vec.ymm = _mm256_shuffle_epi8(bitmask_lookup_vec.ymm, lower_nibbles_vec.ymm); - // - // At this point we can validate the `bitmask_vec` contents like this: - // - // for (sz_size_t i = 0; i != 32; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); - // } - // - // Shift right every byte by 4 bits. - // There is no `_mm256_srli_epi8` intrinsic, so we have to use `_mm256_srli_epi16` - // and combine it with a mask to clear the higher bits. - higher_nibbles_vec.ymm = _mm256_and_si256(_mm256_srli_epi16(text_vec.ymm, 4), _mm256_set1_epi8(0x0f)); - bitset_even_vec.ymm = _mm256_shuffle_epi8(filter_even_vec.ymm, higher_nibbles_vec.ymm); - bitset_odd_vec.ymm = _mm256_shuffle_epi8(filter_odd_vec.ymm, higher_nibbles_vec.ymm); - // - // At this point we can validate the `bitset_even_vec` and `bitset_odd_vec` contents like this: - // - // for (sz_size_t i = 0; i != 32; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t const *bitset_ptr = &filter->_u8s[0]; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; - // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); - // } - // - __m256i take_first = _mm256_cmpgt_epi8(_mm256_set1_epi8(8), lower_nibbles_vec.ymm); - bitset_even_vec.ymm = _mm256_blendv_epi8(bitset_odd_vec.ymm, bitset_even_vec.ymm, take_first); - - // It would have been great to have an instruction that tests the bits and then broadcasts - // the matching bit into all bits in that byte. 
But we don't have that, so we have to - // `and`, `cmpeq`, `movemask`, and then invert at the end... - matches_vec.ymm = _mm256_and_si256(bitset_even_vec.ymm, bitmask_vec.ymm); - matches_vec.ymm = _mm256_cmpeq_epi8(matches_vec.ymm, _mm256_setzero_si256()); - int matches_mask = ~_mm256_movemask_epi8(matches_vec.ymm); - if (matches_mask) { - int offset = sz_u32_ctz(matches_mask); - return text + offset; - } - else { text += 32, length -= 32; } - } - - return sz_find_charset_serial(text, length, filter); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_haswell(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); -} - #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_HASWELL @@ -1036,6 +166,69 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_haswell(sz_cptr_t text, sz_size_t length, s #pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") #pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) +SZ_PUBLIC sz_ordering_t sz_order_skylake(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { + sz_u512_vec_t a_vec, b_vec; + + // Pointer arithmetic is cheap, fetching memory is not! + // So we can use the masked loads to fetch at most one cache-line for each string, + // compare the prefixes, and only then move forward. + sz_size_t a_head_length = 64 - ((sz_size_t)a % 64); // 63 or less. + sz_size_t b_head_length = 64 - ((sz_size_t)b % 64); // 63 or less. + a_head_length = a_head_length < a_length ? a_head_length : a_length; + b_head_length = b_head_length < b_length ? b_head_length : b_length; + sz_size_t head_length = a_head_length < b_head_length ? a_head_length : b_head_length; + __mmask64 head_mask = _sz_u64_mask_until(head_length); + a_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, a); + b_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, b); + __mmask64 mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); + if (mask_not_equal != 0) { + sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); + char a_char = a_vec.u8s[first_diff]; + char b_char = b_vec.u8s[first_diff]; + return _sz_order_scalars(a_char, b_char); + } + else if (head_length == a_length && head_length == b_length) { return sz_equal_k; } + else { a += head_length, b += head_length, a_length -= head_length, b_length -= head_length; } + + // The rare case, when both string are very long. + __mmask64 a_mask, b_mask; + while ((a_length >= 64) & (b_length >= 64)) { + a_vec.zmm = _mm512_loadu_si512(a); + b_vec.zmm = _mm512_loadu_si512(b); + mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); + if (mask_not_equal != 0) { + sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); + char a_char = a_vec.u8s[first_diff]; + char b_char = b_vec.u8s[first_diff]; + return _sz_order_scalars(a_char, b_char); + } + a += 64, b += 64, a_length -= 64, b_length -= 64; + } + + // In most common scenarios at least one of the strings is under 64 bytes. + if (a_length | b_length) { + a_mask = _sz_u64_clamp_mask_until(a_length); + b_mask = _sz_u64_clamp_mask_until(b_length); + a_vec.zmm = _mm512_maskz_loadu_epi8(a_mask, a); + b_vec.zmm = _mm512_maskz_loadu_epi8(b_mask, b); + // The AVX-512 `_mm512_mask_cmpneq_epi8_mask` intrinsics are generally handy in such environments. + // They, however, have latency 3 on most modern CPUs. Using AVX2: `_mm256_cmpeq_epi8` would have + // been cheaper, if we didn't have to apply `_mm256_movemask_epi8` afterwards. 
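The head handling above (and the tail handling that follows) leans on `_sz_u64_mask_until`, which is not shown in this hunk; it presumably yields a 64-bit mask with the lowest `n` bits set, so a masked AVX-512 load touches only the first `n` bytes and zero-fills the remaining lanes. A plausible scalar equivalent, purely illustrative:

    #include <stdint.h>

    /* Presumed behavior of `_sz_u64_mask_until(n)` for n in [0, 64]. */
    static uint64_t mask_until(uint64_t n) {
        return n < 64 ? (1ull << n) - 1 : ~0ull;
    }

    /* mask_until(5) == 0x1F, so `_mm512_maskz_loadu_epi8(mask, ptr)` would read
       exactly 5 bytes starting at `ptr` and leave the other 59 lanes zeroed. */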
+ mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); + if (mask_not_equal != 0) { + sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); + char a_char = a_vec.u8s[first_diff]; + char b_char = b_vec.u8s[first_diff]; + return _sz_order_scalars(a_char, b_char); + } + // From logic perspective, the hardest cases are "abc\0" and "abc". + // The result must be `sz_greater_k`, as the latter is shorter. + else { return _sz_order_scalars(a_length, b_length); } + } + + return sz_equal_k; +} + SZ_PUBLIC sz_bool_t sz_equal_skylake(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { __mmask64 mask; sz_u512_vec_t a_vec, b_vec; @@ -1060,217 +253,6 @@ SZ_PUBLIC sz_bool_t sz_equal_skylake(sz_cptr_t a, sz_cptr_t b, sz_size_t length) return sz_true_k; } -SZ_PUBLIC sz_cptr_t sz_find_byte_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); - - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + sz_u64_ctz(mask); - h += 64, h_length -= 64; - } - - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + sz_u64_ctz(mask); - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_find_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_skylake(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. - __mmask64 matches; - __mmask64 mask; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. - // We have several optimized versions of the algorithm for shorter strings, - // but they all mimic the default case for unbounded length needles - if (n_length >= 64) { - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64( // - _kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - if (sz_equal_skylake(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - - // TODO: If the last character contains a bad byte, we can reposition the start of the next iteration. - // This will be very helpful for very long needles. - } - } - // If there are only 2 or 3 characters in the needle, we don't even need the nested loop. 
- else if (n_length <= 3) { - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64( // - _kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - if (matches) return h + sz_u64_ctz(matches); - } - } - // If the needle is smaller than the size of the ZMM register, we can use masked comparisons - // to avoid the the inner-most nested loop and compare the entire needle against a haystack - // slice in 3 CPU cycles. - else { - __mmask64 n_mask = _sz_u64_mask_until(n_length); - sz_u512_vec_t n_full_vec, h_full_vec; - n_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, n); - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64( // - _kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - h_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, h + potential_offset); - if (_mm512_mask_cmpneq_epi8_mask(n_mask, h_full_vec.zmm, n_full_vec.zmm) == 0) - return h + potential_offset; - matches &= matches - 1; - } - } - } - - // The "tail" of the function uses masked loads to process the remaining bytes. - { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64( // - _kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - if (n_length <= 3 || sz_equal_skylake(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); - - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h + h_length - 64); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + h_length - 1 - sz_u64_clz(mask); - h_length -= 64; - } - - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + 64 - sz_u64_clz(mask) - 1; - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. 
- if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_skylake(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. - __mmask64 mask; - __mmask64 matches; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. - sz_cptr_t h_reversed; - for (; h_length >= n_length + 64; h_length -= 64) { - h_reversed = h + h_length - n_length - 64 + 1; - h_first_vec.zmm = _mm512_loadu_si512(h_reversed + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h_reversed + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h_reversed + offset_last); - matches = _kand_mask64( // - _kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_skylake(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } - - // The "tail" of the function uses masked loads to process the remaining bytes. - { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64( // - _kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_skylake(h + 64 - potential_offset - 1, n, n_length)) - return h + 64 - potential_offset - 1; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } - - return SZ_NULL_CHAR; -} - #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_SKYLAKE @@ -1289,124 +271,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t #pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ apply_to = function) -SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - - // Before initializing the AVX-512 vectors, we may want to run the sequential code for the first few bytes. - // In practice, that only hurts, even when we have matches every 5-ish bytes. 
- // - // if (length < SZ_SWAR_THRESHOLD) return sz_find_charset_serial(text, length, filter); - // sz_cptr_t early_result = sz_find_charset_serial(text, SZ_SWAR_THRESHOLD, filter); - // if (early_result) return early_result; - // text += SZ_SWAR_THRESHOLD; - // length -= SZ_SWAR_THRESHOLD; - // - // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. - // That way when we invoke `_mm512_shuffle_epi8` we can use the same mask for both lanes. - sz_u512_vec_t filter_even_vec, filter_odd_vec; - __m256i filter_ymm = _mm256_lddqu_si256((__m256i const *)filter); - // There are a few way to initialize filters without having native strided loads. - // In the cronological order of experiments: - // - serial code initializing 128 bytes of odd and even mask - // - using several shuffles - // - using `_mm512_permutexvar_epi8` - // - using `_mm512_broadcast_i32x4(_mm256_castsi256_si128(_mm256_maskz_compress_epi8(0x55555555, filter_ymm)))` - // and `_mm512_broadcast_i32x4(_mm256_castsi256_si128(_mm256_maskz_compress_epi8(0xaaaaaaaa, filter_ymm)))` - filter_even_vec.zmm = _mm512_broadcast_i32x4(_mm256_castsi256_si128( // broadcast __m128i to __m512i - _mm256_maskz_compress_epi8(0x55555555, filter_ymm))); - filter_odd_vec.zmm = _mm512_broadcast_i32x4(_mm256_castsi256_si128( // broadcast __m128i to __m512i - _mm256_maskz_compress_epi8(0xaaaaaaaa, filter_ymm))); - // After the unzipping operation, we can validate the contents of the vectors like this: - // - // for (sz_size_t i = 0; i != 16; ++i) { - // sz_assert(filter_even_vec.u8s[i] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 16] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 16] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 32] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 32] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 48] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 48] == filter->_u8s[i * 2 + 1]); - // } - // - sz_u512_vec_t text_vec; - sz_u512_vec_t lower_nibbles_vec, higher_nibbles_vec; - sz_u512_vec_t bitset_even_vec, bitset_odd_vec; - sz_u512_vec_t bitmask_vec, bitmask_lookup_vec; - bitmask_lookup_vec.zmm = _mm512_set_epi8( // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); - - while (length) { - // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set" - // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so - // StrinZilla uses a somewhat different approach. - // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new - // - // sz_u8_t input = *(sz_u8_t const *)text; - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = filter_even_vec.u8s[hi_nibble]; - // sz_u8_t bitset_odd = filter_odd_vec.u8s[hi_nibble]; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_u8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd; - // if ((bitset & bitmask) != 0) return text; - // else { length--, text++; } - // - // The nice part about this, loading the strided data is vey easy with Arm NEON, - // while with x86 CPUs after AVX, shuffles within 256 bits shouldn't be an issue either. 
- sz_size_t load_length = sz_min_of_two(length, 64); - __mmask64 load_mask = _sz_u64_mask_until(load_length); - text_vec.zmm = _mm512_maskz_loadu_epi8(load_mask, text); - lower_nibbles_vec.zmm = _mm512_and_si512(text_vec.zmm, _mm512_set1_epi8(0x0f)); - bitmask_vec.zmm = _mm512_shuffle_epi8(bitmask_lookup_vec.zmm, lower_nibbles_vec.zmm); - // - // At this point we can validate the `bitmask_vec` contents like this: - // - // for (sz_size_t i = 0; i != load_length; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); - // } - // - // Shift right every byte by 4 bits. - // There is no `_mm512_srli_epi8` intrinsic, so we have to use `_mm512_srli_epi16` - // and combine it with a mask to clear the higher bits. - higher_nibbles_vec.zmm = _mm512_and_si512(_mm512_srli_epi16(text_vec.zmm, 4), _mm512_set1_epi8(0x0f)); - bitset_even_vec.zmm = _mm512_shuffle_epi8(filter_even_vec.zmm, higher_nibbles_vec.zmm); - bitset_odd_vec.zmm = _mm512_shuffle_epi8(filter_odd_vec.zmm, higher_nibbles_vec.zmm); - // - // At this point we can validate the `bitset_even_vec` and `bitset_odd_vec` contents like this: - // - // for (sz_size_t i = 0; i != load_length; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t const *bitset_ptr = &filter->_u8s[0]; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; - // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); - // } - // - // TODO: Is this a good place for ternary logic? - __mmask64 take_first = _mm512_cmplt_epi8_mask(lower_nibbles_vec.zmm, _mm512_set1_epi8(8)); - bitset_even_vec.zmm = _mm512_mask_blend_epi8(take_first, bitset_odd_vec.zmm, bitset_even_vec.zmm); - __mmask64 matches_mask = _mm512_mask_test_epi8_mask(load_mask, bitset_even_vec.zmm, bitmask_vec.zmm); - if (matches_mask) { - int offset = sz_u64_ctz(matches_mask); - return text + offset; - } - else { text += load_length, length -= load_length; } - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_ice(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); -} +/* Nothing here for now. */ #pragma clang attribute pop #pragma GCC pop_options @@ -1422,10 +287,10 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_ice(sz_cptr_t text, sz_size_t length, sz_ch #pragma GCC target("arch=armv8.2-a+simd") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) -SZ_INTERNAL sz_u64_t _sz_vreinterpretq_u8_u4(uint8x16_t vec) { - // Use `vshrn` to produce a bitmask, similar to `movemask` in SSE. - // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon - return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(vec), 4)), 0) & 0x8888888888888888ull; +SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { + //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: + //! 
https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations + return sz_order_serial(a, a_length, b, b_length); } SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { @@ -1442,215 +307,6 @@ SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { return sz_true_k; } -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - sz_u64_t matches; - sz_u128_vec_t h_vec, n_vec, matches_vec; - n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n); - - while (h_length >= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h); - matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16); - // In Arm NEON we don't have a `movemask` to combine it with `ctz` and get the offset of the match. - // But assuming the `vmaxvq` is cheap, we can use it to find the first match, by blending (bitwise selecting) - // the vector with a relative offsets array. - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - - h += 16, h_length -= 16; - } - - return sz_find_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - sz_u64_t matches; - sz_u128_vec_t h_vec, n_vec, matches_vec; - n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n); - - while (h_length >= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h + h_length - 16); - matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; - h_length -= 16; - } - - return sz_rfind_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_u64_t _sz_find_charset_neon_register( // - sz_u128_vec_t h_vec, uint8x16_t set_top_vec_u8x16, uint8x16_t set_bottom_vec_u8x16) { - - // Once we've read the characters in the haystack, we want to - // compare them against our bitset. The serial version of that code - // would look like: `(set_->_u8s[c >> 3] & (1u << (c & 7u))) != 0`. - uint8x16_t byte_index_vec = vshrq_n_u8(h_vec.u8x16, 3); - uint8x16_t byte_mask_vec = vshlq_u8(vdupq_n_u8(1), vreinterpretq_s8_u8(vandq_u8(h_vec.u8x16, vdupq_n_u8(7)))); - uint8x16_t matches_top_vec = vqtbl1q_u8(set_top_vec_u8x16, byte_index_vec); - // The table lookup instruction in NEON replies to out-of-bound requests with zeros. - // The values in `byte_index_vec` all fall in [0; 32). So for values under 16, substracting 16 will underflow - // and map into interval [240, 256). Meaning that those will be populated with zeros and we can safely - // merge `matches_top_vec` and `matches_bottom_vec` with a bitwise OR. - uint8x16_t matches_bottom_vec = vqtbl1q_u8(set_bottom_vec_u8x16, vsubq_u8(byte_index_vec, vdupq_n_u8(16))); - uint8x16_t matches_vec = vorrq_u8(matches_top_vec, matches_bottom_vec); - // Istead of pure `vandq_u8`, we can immediately broadcast a match presence across each 8-bit word. - matches_vec = vtstq_u8(matches_vec, byte_mask_vec); - return _sz_vreinterpretq_u8_u4(matches_vec); -} - -SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_neon(h, h_length, n); - - // Scan through the string. - // Assuming how tiny the Arm NEON registers are, we should avoid internal branches at all costs. - // That's why, for smaller needles, we use different loops. 
- if (n_length == 2) { - // Broadcast needle characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_last_vec, n_first_vec, n_last_vec, matches_vec; - // Dealing with 16-bit values, we can load 2 registers at a time and compare 31 possible offsets - // in a single loop iteration. - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]); - for (; h_length >= 17; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1)); - matches_vec.u8x16 = - vandq_u8(vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - } - else if (n_length == 3) { - // Broadcast needle characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - // Comparing 24-bit values is a bumer. Being lazy, I went with the same approach - // as when searching for string over 4 characters long. I only avoid the last comparison. - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[2]); - for (; h_length >= 18; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 2)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - } - else { - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - // Broadcast those characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]); - // Walk through the string. - for (; h_length >= n_length + 16; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_first)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_mid)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_last)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - while (matches) { - int potential_offset = sz_u64_ctz(matches) / 4; - if (sz_equal_neon(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - } - - return sz_find_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. 
- if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_neon(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Will contain 4 bits per character. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]); - - sz_cptr_t h_reversed; - for (; h_length >= n_length + 16; h_length -= 16) { - h_reversed = h + h_length - n_length - 16 + 1; - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_first)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_mid)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_last)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - while (matches) { - int potential_offset = sz_u64_clz(matches) / 4; - if (sz_equal_neon(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & (1ull << (63 - potential_offset * 4))) != 0 && - "The bit must be set before we squash it"); - matches &= ~(1ull << (63 - potential_offset * 4)); - } - } - - return sz_rfind_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { - sz_u64_t matches; - sz_u128_vec_t h_vec; - uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); - uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); - - for (; h_length >= 16; h += 16, h_length -= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h)); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - - return sz_find_charset_serial(h, h_length, set); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { - sz_u64_t matches; - sz_u128_vec_t h_vec; - uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); - uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); - - // Check `sz_find_charset_neon` for explanations. - for (; h_length >= 16; h_length -= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h) + h_length - 16); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); - if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; - } - - return sz_rfind_charset_serial(h, h_length, set); -} - #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_NEON @@ -1665,6 +321,8 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_ch #pragma GCC target("arch=armv8.2-a+sve") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) +/* Nothing here for now. 
*/ + #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_SVE @@ -1676,118 +334,34 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_ch #pragma region Compile Time Dispatching #if !SZ_DYNAMIC_DISPATCH -#pragma region Core Funcitonality - -SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_SKYLAKE - return sz_find_byte_skylake(haystack, h_length, needle); -#elif SZ_USE_HASWELL - return sz_find_byte_haswell(haystack, h_length, needle); -#elif SZ_USE_NEON - return sz_find_byte_neon(haystack, h_length, needle); -#else - return sz_find_byte_serial(haystack, h_length, needle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_SKYLAKE - return sz_rfind_byte_skylake(haystack, h_length, needle); -#elif SZ_USE_HASWELL - return sz_rfind_byte_haswell(haystack, h_length, needle); -#elif SZ_USE_NEON - return sz_rfind_byte_neon(haystack, h_length, needle); -#else - return sz_rfind_byte_serial(haystack, h_length, needle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { +SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { #if SZ_USE_SKYLAKE - return sz_find_skylake(haystack, h_length, needle, n_length); + return sz_equal_skylake(a, b, length); #elif SZ_USE_HASWELL - return sz_find_haswell(haystack, h_length, needle, n_length); + return sz_equal_haswell(a, b, length); #elif SZ_USE_NEON - return sz_find_neon(haystack, h_length, needle, n_length); + return sz_equal_neon(a, b, length); #else - return sz_find_serial(haystack, h_length, needle, n_length); + return sz_equal_serial(a, b, length); #endif } -SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { +SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { #if SZ_USE_SKYLAKE - return sz_rfind_skylake(haystack, h_length, needle, n_length); + return sz_order_skylake(a, a_length, b, b_length); #elif SZ_USE_HASWELL - return sz_rfind_haswell(haystack, h_length, needle, n_length); + return sz_order_haswell(a, a_length, b, b_length); #elif SZ_USE_NEON - return sz_rfind_neon(haystack, h_length, needle, n_length); + return sz_order_neon(a, a_length, b, b_length); #else - return sz_rfind_serial(haystack, h_length, needle, n_length); + return sz_order_serial(a, a_length, b, b_length); #endif } -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_ICE - return sz_find_charset_ice(text, length, set); -#elif SZ_USE_HASWELL - return sz_find_charset_haswell(text, length, set); -#elif SZ_USE_NEON - return sz_find_charset_neon(text, length, set); -#else - return sz_find_charset_serial(text, length, set); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_ICE - return sz_rfind_charset_ice(text, length, set); -#elif SZ_USE_HASWELL - return sz_rfind_charset_haswell(text, length, set); -#elif SZ_USE_NEON - return sz_rfind_charset_neon(text, length, set); -#else - return sz_rfind_charset_serial(text, length, set); -#endif -} - -#pragma endregion -#pragma region Helper Shortcuts - -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) 
sz_charset_add(&set, *n); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_rfind_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_rfind_charset(h, h_length, &set); -} - -#pragma endregion // Helper Shortcuts #endif // !SZ_DYNAMIC_DISPATCH #pragma endregion // Compile Time Dispatching #ifdef __cplusplus } #endif // __cplusplus -#endif // STRINGZILLA_FIND_H_ +#endif // STRINGZILLA_COMPARE_H_ diff --git a/include/stringzilla/find.h b/include/stringzilla/find.h index 4571515d..91892a0f 100644 --- a/include/stringzilla/find.h +++ b/include/stringzilla/find.h @@ -22,6 +22,8 @@ #include "types.h" +#include "compare.h" // `sz_equal` + #ifdef __cplusplus extern "C" { #endif @@ -194,26 +196,6 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t haystack, sz_size_t h_length #pragma region Serial Implementation -/** - * @brief Byte-level equality comparison between two strings. - * If unaligned loads are allowed, uses a switch-table to avoid loops on short strings. - */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_cptr_t const a_end = a + length; -#if SZ_USE_MISALIGNED_LOADS - if (length >= SZ_SWAR_THRESHOLD) { - sz_u64_vec_t a_vec, b_vec; - for (; a + 8 <= a_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) return sz_false_k; - } - } -#endif - while (a != a_end && *a == *b) a++, b++; - return (sz_bool_t)(a_end == a); -} - /** * @brief Chooses the offsets of the most interesting characters in a search needle. * @@ -804,24 +786,8 @@ SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n #pragma region Haswell Implementation #if SZ_USE_HASWELL #pragma GCC push_options -#pragma GCC target("haswell") -#pragma clang attribute push(__attribute__((target("haswell"))), apply_to = function) - -SZ_PUBLIC sz_bool_t sz_equal_haswell(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_u256_vec_t a_vec, b_vec; - - while (length >= 32) { - a_vec.ymm = _mm256_lddqu_si256((__m256i const *)a); - b_vec.ymm = _mm256_lddqu_si256((__m256i const *)b); - // One approach can be to use "movemasks", but we could also use a bitwise matching like `_mm256_testnzc_si256`. 
- int difference_mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi8(a_vec.ymm, b_vec.ymm)); - if (difference_mask == 0) { a += 32, b += 32, length -= 32; } - else { return sz_false_k; } - } - - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; -} +#pragma GCC target("avx2") +#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) SZ_PUBLIC sz_cptr_t sz_find_byte_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { int mask; @@ -1036,30 +1002,6 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_haswell(sz_cptr_t text, sz_size_t length, s #pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") #pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) -SZ_PUBLIC sz_bool_t sz_equal_skylake(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - __mmask64 mask; - sz_u512_vec_t a_vec, b_vec; - - while (length >= 64) { - a_vec.zmm = _mm512_loadu_si512(a); - b_vec.zmm = _mm512_loadu_si512(b); - mask = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask != 0) return sz_false_k; - a += 64, b += 64, length -= 64; - } - - if (length) { - mask = _sz_u64_mask_until(length); - a_vec.zmm = _mm512_maskz_loadu_epi8(mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(mask, b); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpneq_epi8_mask(mask, a_vec.zmm, b_vec.zmm); - return (sz_bool_t)(mask == 0); - } - - return sz_true_k; -} - SZ_PUBLIC sz_cptr_t sz_find_byte_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { __mmask64 mask; sz_u512_vec_t h_vec, n_vec; @@ -1428,20 +1370,6 @@ SZ_INTERNAL sz_u64_t _sz_vreinterpretq_u8_u4(uint8x16_t vec) { return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(vec), 4)), 0) & 0x8888888888888888ull; } -SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_u128_vec_t a_vec, b_vec; - for (; length >= 16; a += 16, b += 16, length -= 16) { - a_vec.u8x16 = vld1q_u8((sz_u8_t const *)a); - b_vec.u8x16 = vld1q_u8((sz_u8_t const *)b); - uint8x16_t cmp = vceqq_u8(a_vec.u8x16, b_vec.u8x16); - if (vminvq_u8(cmp) != 255) { return sz_false_k; } // Check if all bytes match - } - - // Handle remaining bytes - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; -} - SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { sz_u64_t matches; sz_u128_vec_t h_vec, n_vec, matches_vec; @@ -1676,7 +1604,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_ch #pragma region Compile Time Dispatching #if !SZ_DYNAMIC_DISPATCH -#pragma region Core Funcitonality +#pragma region Core Functionality SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { #if SZ_USE_SKYLAKE From 00f27f62c0767838f11dee34359a4aefd55977bd Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 8 Dec 2024 16:21:44 +0000 Subject: [PATCH 047/751] Fix: Haswell compilation flag --- include/stringzilla/memory.h | 6 +++--- include/stringzilla/similarity.h | 4 ++-- include/stringzilla/small_string.h | 3 ++- include/stringzilla/sort.h | 2 ++ 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/stringzilla/memory.h b/include/stringzilla/memory.h index 32106a82..06a3dc60 100644 --- a/include/stringzilla/memory.h +++ b/include/stringzilla/memory.h @@ -328,8 +328,8 @@ SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt #if 
SZ_USE_HASWELL #pragma GCC push_options -#pragma GCC target("haswell") -#pragma clang attribute push(__attribute__((target("haswell"))), apply_to = function) +#pragma GCC target("avx2") +#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) SZ_PUBLIC void sz_fill_haswell(sz_ptr_t target, sz_size_t length, sz_u8_t value) { char value_char = *(char *)&value; @@ -1253,7 +1253,7 @@ SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length) #pragma region Compile Time Dispatching #if !SZ_DYNAMIC_DISPATCH -#pragma region Core Funcitonality +#pragma region Core Functionality SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { #if SZ_USE_ICE diff --git a/include/stringzilla/similarity.h b/include/stringzilla/similarity.h index ef34b824..5451c95f 100644 --- a/include/stringzilla/similarity.h +++ b/include/stringzilla/similarity.h @@ -579,8 +579,8 @@ SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( // #pragma region Haswell Implementation #if SZ_USE_HASWELL #pragma GCC push_options -#pragma GCC target("haswell") -#pragma clang attribute push(__attribute__((target("haswell"))), apply_to = function) +#pragma GCC target("avx2") +#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) #pragma clang attribute pop #pragma GCC pop_options diff --git a/include/stringzilla/small_string.h b/include/stringzilla/small_string.h index 17625700..ba823901 100644 --- a/include/stringzilla/small_string.h +++ b/include/stringzilla/small_string.h @@ -24,9 +24,10 @@ #ifndef STRINGZILLA_SMALL_STRING_H_ #define STRINGZILLA_SMALL_STRING_H_ +#include "types.h" + #include "find.h" // `sz_equal` #include "memory.h" // `sz_copy`, `sz_move`, `sz_fill` -#include "types.h" // `sz_size_t`, `sz_ptr_t`, `sz_cptr_t` #ifdef __cplusplus extern "C" { diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index 4fe64bee..7a8de124 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -15,6 +15,8 @@ #include "types.h" +#include "compare.h" // `sz_compare` + #ifdef __cplusplus extern "C" { #endif From 406bf0f2befc379c17372e4871e62cc13d6f5ad8 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 8 Dec 2024 19:21:58 +0000 Subject: [PATCH 048/751] Fix: Symbols names & visibility --- README.md | 5 +- c/lib.c | 216 +- include/stringzilla/drafts.h | 28 +- include/stringzilla/find.h | 85 +- include/stringzilla/hash.h | 70 +- include/stringzilla/memory.h | 50 +- include/stringzilla/similarity.h | 115 +- include/stringzilla/small_string.h | 16 +- include/stringzilla/stringzilla.h | 6451 +-------------------------- include/stringzilla/stringzilla.hpp | 100 +- include/stringzilla/types.h | 49 +- scripts/bench_memory.cpp | 20 +- scripts/bench_similarity.cpp | 8 +- scripts/bench_token.cpp | 12 +- scripts/test.cpp | 29 +- 15 files changed, 458 insertions(+), 6796 deletions(-) diff --git a/README.md b/README.md index c4122696..c07050a3 100644 --- a/README.md +++ b/README.md @@ -624,7 +624,8 @@ sz_string_view_t needle = {your_subtext, your_subtext_length}; // Perform string-level operations sz_size_t substring_position = sz_find(haystack.start, haystack.length, needle.start, needle.length); -sz_size_t substring_position = sz_find_avx512(haystack.start, haystack.length, needle.start, needle.length); +sz_size_t substring_position = sz_find_skylake(haystack.start, haystack.length, needle.start, needle.length); +sz_size_t substring_position = 
sz_find_haswell(haystack.start, haystack.length, needle.start, needle.length); sz_size_t substring_position = sz_find_neon(haystack.start, haystack.length, needle.start, needle.length); // Hash strings @@ -747,7 +748,7 @@ typedef union sz_string_t { struct internal { sz_ptr_t start; sz_u8_t length; - char chars[SZ_STRING_INTERNAL_SPACE]; /// Ends with a null-terminator. + char chars[_SZ_STRING_INTERNAL_SPACE]; /// Ends with a null-terminator. } internal; struct external { diff --git a/c/lib.c b/c/lib.c index e1d98328..8a0a75b9 100644 --- a/c/lib.c +++ b/c/lib.c @@ -3,10 +3,20 @@ * @brief StringZilla C library with dynamic backed dispatch for the most appropriate implementation. * @author Ash Vardanian * @date January 16, 2024 - * @copyright Copyright (c) 2024 */ -#if defined(_WIN32) || defined(__CYGWIN__) -#include // `DllMain` +#if SZ_AVOID_LIBC +// If we don't have the LibC, the `malloc` definition in `stringzilla.h` will be illformed. +#ifdef _MSC_VER +typedef sz_size_t size_t; // Reuse the type definition we've inferred from `stringzilla.h` +extern __declspec(dllimport) int rand(void); +extern __declspec(dllimport) void free(void *start); +extern __declspec(dllimport) void *malloc(size_t length); +#else +typedef __SIZE_TYPE__ size_t; // For GCC/Clang +extern int rand(void); +extern void free(void *start); +extern void *malloc(size_t length); +#endif #endif // When enabled, this library will override the symbols usually provided by the C standard library. @@ -23,35 +33,32 @@ #define SZ_DYNAMIC_DISPATCH 1 #include -#if SZ_AVOID_LIBC -// If we don't have the LibC, the `malloc` definition in `stringzilla.h` will be illformed. -#ifdef _MSC_VER -typedef sz_size_t size_t; // Reuse the type definition we've inferred from `stringzilla.h` -extern __declspec(dllimport) int rand(void); -extern __declspec(dllimport) void free(void *start); -extern __declspec(dllimport) void *malloc(size_t length); -#else -typedef __SIZE_TYPE__ size_t; // For GCC/Clang -extern int rand(void); -extern void free(void *start); -extern void *malloc(size_t length); -#endif +// Inferring target OS: Windows, MacOS, or Linux +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || defined(__CYGWIN__) +#define _SZ_IS_WINDOWS 1 +#elif defined(__APPLE__) && defined(__MACH__) +#define _SZ_IS_APPLE 1 +#elif defined(__linux__) +#define _SZ_IS_LINUX 1 #endif // On Apple Silicon, `mrs` is not allowed in user-space, so we need to use the `sysctl` API. -#if defined(__APPLE__) && defined(__MACH__) -#define SZ_APPLE 1 +#if defined(_SZ_IS_APPLE) #include #endif -#if defined(__linux__) -#define SZ_LINUX 1 +#if defined(_SZ_IS_WINDOWS) +#include // `DllMain` #endif -SZ_INTERNAL sz_capability_t sz_capabilities_arm(void) { +/** + * @brief Function to determine the SIMD capabilities of the current 64-bit Arm machine at @b runtime. + * @return A bitmask of the SIMD capabilities represented as a `sz_capability_t` enum value. + */ +SZ_INTERNAL sz_capability_t _sz_capabilities_arm(void) { // https://github.com/ashvardanian/SimSIMD/blob/28e536083602f85ad0c59456782c8864463ffb0e/include/simsimd/simsimd.h#L434 // for documentation on how we detect capabilities across different ARM platforms. -#if defined(SZ_APPLE) +#if defined(_SZ_IS_APPLE) // On Apple Silicon, `mrs` is not allowed in user-space, so we need to use the `sysctl` API. 
uint32_t supports_neon = 0; @@ -62,20 +69,47 @@ SZ_INTERNAL sz_capability_t sz_capabilities_arm(void) { (sz_cap_arm_neon_k * (supports_neon)) | // (sz_cap_serial_k)); -#elif defined(SZ_LINUX) - unsigned supports_neon = 1; // NEON is always supported +#elif defined(_SZ_IS_LINUX) + + // Read CPUID registers directly + unsigned long id_aa64isar0_el1 = 0, id_aa64isar1_el1 = 0, id_aa64pfr0_el1 = 0, id_aa64zfr0_el1 = 0; + + // Now let's unpack the status flags from ID_AA64ISAR0_EL1 + // https://developer.arm.com/documentation/ddi0601/2024-03/AArch64-Registers/ID-AA64ISAR0-EL1--AArch64-Instruction-Set-Attribute-Register-0?lang=en + __asm__ __volatile__("mrs %0, ID_AA64ISAR0_EL1" : "=r"(id_aa64isar0_el1)); + // Now let's unpack the status flags from ID_AA64ISAR1_EL1 + // https://developer.arm.com/documentation/ddi0601/2024-03/AArch64-Registers/ID-AA64ISAR1-EL1--AArch64-Instruction-Set-Attribute-Register-1?lang=en + __asm__ __volatile__("mrs %0, ID_AA64ISAR1_EL1" : "=r"(id_aa64isar1_el1)); + // Now let's unpack the status flags from ID_AA64PFR0_EL1 + // https://developer.arm.com/documentation/ddi0601/2024-03/AArch64-Registers/ID-AA64PFR0-EL1--AArch64-Processor-Feature-Register-0?lang=en __asm__ __volatile__("mrs %0, ID_AA64PFR0_EL1" : "=r"(id_aa64pfr0_el1)); + // SVE, bits [35:32] of ID_AA64PFR0_EL1 unsigned supports_sve = ((id_aa64pfr0_el1 >> 32) & 0xF) >= 1; - return (sz_capability_t)( // - (sz_cap_neon_k * (supports_neon)) | // - (sz_cap_sve_k * (supports_sve)) | // + // Now let's unpack the status flags from ID_AA64ZFR0_EL1 + // https://developer.arm.com/documentation/ddi0601/2024-03/AArch64-Registers/ID-AA64ZFR0-EL1--SVE-Feature-ID-Register-0?lang=en + if (supports_sve) __asm__ __volatile__("mrs %0, ID_AA64ZFR0_EL1" : "=r"(id_aa64zfr0_el1)); + // SVEver, bits [3:0] can be used to check for capability levels: + // - 0b0000: SVE is implemented + // - 0b0001: SVE2 is implemented + // - 0b0010: SVE2.1 is implemented + // This value must match the existing indicator obtained from ID_AA64PFR0_EL1: + unsigned supports_sve2 = ((id_aa64zfr0_el1) & 0xF) >= 1; + unsigned supports_sve2p1 = ((id_aa64zfr0_el1) & 0xF) >= 2; + unsigned supports_neon = 1; // NEON is always supported + + return (sz_capability_t)( // + (sz_cap_neon_k * (supports_neon)) | // + (sz_cap_sve_k * (supports_sve)) | // + (sz_cap_sve2_k * (supports_sve2)) | // + (sz_cap_sve2p1_k * (supports_sve2p1)) | // (sz_cap_serial_k)); -#else // SIMSIMD_DEFINED_LINUX + +#else // if !defined(_SZ_IS_APPLE) && !defined(_SZ_IS_LINUX) return sz_cap_serial_k; #endif } -SZ_DYNAMIC sz_capability_t sz_capabilities(void) { +SZ_INTERNAL sz_capability_t _sz_capabilities_x86(void) { #if SZ_USE_HASWELL || SZ_USE_SKYLAKE || SZ_USE_ICE @@ -91,54 +125,50 @@ SZ_DYNAMIC sz_capability_t sz_capabilities(void) { __cpuidex(info1.array, 1, 0); __cpuidex(info7.array, 7, 0); #else - __asm__ __volatile__("cpuid" - : "=a"(info1.named.eax), "=b"(info1.named.ebx), "=c"(info1.named.ecx), "=d"(info1.named.edx) - : "a"(1), "c"(0)); - __asm__ __volatile__("cpuid" - : "=a"(info7.named.eax), "=b"(info7.named.ebx), "=c"(info7.named.ecx), "=d"(info7.named.edx) - : "a"(7), "c"(0)); + __asm__ __volatile__( // + "cpuid" + : "=a"(info1.named.eax), "=b"(info1.named.ebx), "=c"(info1.named.ecx), "=d"(info1.named.edx) + : "a"(1), "c"(0)); + __asm__ __volatile__( // + "cpuid" + : "=a"(info7.named.eax), "=b"(info7.named.ebx), "=c"(info7.named.ecx), "=d"(info7.named.edx) + : "a"(7), "c"(0)); #endif - // Check for AVX2 (Function ID 7, EBX register) + // Check for AVX2 (Function ID 7, EBX 
register), you can take the relevant flags from the LLVM implementation: // https://github.com/llvm/llvm-project/blob/50598f0ff44f3a4e75706f8c53f3380fe7faa896/clang/lib/Headers/cpuid.h#L148 unsigned supports_avx2 = (info7.named.ebx & 0x00000020) != 0; - // Check for AVX512F (Function ID 7, EBX register) - // https://github.com/llvm/llvm-project/blob/50598f0ff44f3a4e75706f8c53f3380fe7faa896/clang/lib/Headers/cpuid.h#L155 unsigned supports_avx512f = (info7.named.ebx & 0x00010000) != 0; - // Check for AVX512BW (Function ID 7, EBX register) - // https://github.com/llvm/llvm-project/blob/50598f0ff44f3a4e75706f8c53f3380fe7faa896/clang/lib/Headers/cpuid.h#L166 unsigned supports_avx512bw = (info7.named.ebx & 0x40000000) != 0; - // Check for AVX512VL (Function ID 7, EBX register) - // https://github.com/llvm/llvm-project/blob/50598f0ff44f3a4e75706f8c53f3380fe7faa896/clang/lib/Headers/cpuid.h#L167C25-L167C35 unsigned supports_avx512vl = (info7.named.ebx & 0x80000000) != 0; - // Check for GFNI (Function ID 1, ECX register) - // https://github.com/llvm/llvm-project/blob/50598f0ff44f3a4e75706f8c53f3380fe7faa896/clang/lib/Headers/cpuid.h#L171C30-L171C40 unsigned supports_avx512vbmi = (info7.named.ecx & 0x00000002) != 0; unsigned supports_avx512vbmi2 = (info7.named.ecx & 0x00000040) != 0; - // Check for GFNI (Function ID 1, ECX register) - // https://github.com/llvm/llvm-project/blob/50598f0ff44f3a4e75706f8c53f3380fe7faa896/clang/lib/Headers/cpuid.h#L177C30-L177C40 - unsigned supports_gfni = (info7.named.ecx & 0x00000100) != 0; - - return (sz_capability_t)( // - (sz_cap_x86_avx2_k * supports_avx2) | // - (sz_cap_x86_avx512f_k * supports_avx512f) | // - (sz_cap_x86_avx512vl_k * supports_avx512vl) | // - (sz_cap_x86_avx512bw_k * supports_avx512bw) | // - (sz_cap_x86_avx512vbmi_k * supports_avx512vbmi) | // - (sz_cap_x86_avx512vbmi2_k * supports_avx512vbmi2) | // - (sz_cap_x86_gfni_k * (supports_gfni)) | // - (sz_cap_serial_k)); - -#endif // SZ_TARGET_X86 - -#if SZ_USE_NEON || SZ_USE_SVE + unsigned supports_vaes = (info7.named.ecx & 0x00000200) != 0; - return sz_capabilities_arm(); - -#endif // SZ_TARGET_ARM + return (sz_capability_t)( // + (sz_cap_haswell_k * supports_avx2) | // + (sz_cap_skylake_k * (supports_avx512f && supports_avx512vl && supports_avx512bw && supports_vaes)) | // + (sz_cap_ice_k * (supports_avx512vbmi && supports_avx512vbmi2)) | // + (sz_cap_serial_k)); +#else + return sz_cap_serial_k; +#endif +} +/** + * @brief Function to determine the SIMD capabilities of the current 64-bit x86 machine at @b runtime. + * @return A bitmask of the SIMD capabilities represented as a `sz_capability_t` enum value. 
+ */ +SZ_DYNAMIC sz_capability_t sz_capabilities(void) { +#if _SZ_IS_X86 + return _sz_capabilities_x86(); +#elif _SZ_IS_ARM + return _sz_capabilities_arm(); +#else return sz_cap_serial_k; +#endif } + typedef struct sz_implementations_t { sz_equal_t equal; sz_order_t order; @@ -197,56 +227,54 @@ static void sz_dispatch_table_init(void) { impl->hashes = sz_hashes_serial; #if SZ_USE_HASWELL - if (caps & sz_cap_x86_avx2_k) { - impl->equal = sz_equal_avx2; - impl->order = sz_order_avx2; - - impl->copy = sz_copy_avx2; - impl->move = sz_move_avx2; - impl->fill = sz_fill_avx2; - impl->look_up_transform = sz_look_up_transform_avx2; - impl->checksum = sz_checksum_avx2; - - impl->find_byte = sz_find_byte_avx2; - impl->rfind_byte = sz_rfind_byte_avx2; - impl->find = sz_find_avx2; - impl->rfind = sz_rfind_avx2; - impl->find_from_set = sz_find_charset_avx2; - impl->rfind_from_set = sz_rfind_charset_avx2; + if (caps & sz_cap_haswell_k) { + impl->equal = sz_equal_haswell; + impl->order = sz_order_haswell; + + impl->copy = sz_copy_haswell; + impl->move = sz_move_haswell; + impl->fill = sz_fill_haswell; + impl->look_up_transform = sz_look_up_transform_haswell; + impl->checksum = sz_checksum_haswell; + + impl->find_byte = sz_find_byte_haswell; + impl->rfind_byte = sz_rfind_byte_haswell; + impl->find = sz_find_haswell; + impl->rfind = sz_rfind_haswell; + impl->find_from_set = sz_find_charset_haswell; + impl->rfind_from_set = sz_rfind_charset_haswell; } #endif #if SZ_USE_SKYLAKE - if (caps & sz_cap_x86_avx512f_k) { + if (caps & sz_cap_skylake_k) { impl->equal = sz_equal_skylake; - impl->order = sz_order_avx512; + impl->order = sz_order_skylake; - impl->copy = sz_copy_avx512; - impl->move = sz_move_avx512; - impl->fill = sz_fill_avx512; + impl->copy = sz_copy_skylake; + impl->move = sz_move_skylake; + impl->fill = sz_fill_skylake; impl->find = sz_find_skylake; impl->rfind = sz_rfind_skylake; - impl->find_byte = sz_find_byte_avx512; - impl->rfind_byte = sz_rfind_byte_avx512; - - impl->edit_distance = sz_edit_distance_avx512; + impl->find_byte = sz_find_byte_skylake; + impl->rfind_byte = sz_rfind_byte_skylake; } #endif #if SZ_USE_ICE - if ((caps & sz_cap_x86_avx512f_k) && (caps & sz_cap_x86_avx512vl_k) && (caps & sz_cap_x86_avx512vbmi2_k) && - (caps & sz_cap_x86_avx512bw_k) && (caps & sz_cap_x86_avx512vbmi_k)) { + if (caps & sz_cap_ice_k) { impl->find_from_set = sz_find_charset_ice; impl->rfind_from_set = sz_rfind_charset_ice; - impl->alignment_score = sz_alignment_score_avx512; + impl->edit_distance = sz_edit_distance_ice; + impl->alignment_score = sz_alignment_score_ice; impl->look_up_transform = sz_look_up_transform_ice; - impl->checksum = sz_checksum_avx512; + impl->checksum = sz_checksum_ice; } #endif #if SZ_USE_NEON - if (caps & sz_cap_arm_neon_k) { + if (caps & sz_cap_neon_k) { impl->equal = sz_equal_neon; impl->copy = sz_copy_neon; diff --git a/include/stringzilla/drafts.h b/include/stringzilla/drafts.h index 1817a81e..49099cbe 100644 --- a/include/stringzilla/drafts.h +++ b/include/stringzilla/drafts.h @@ -342,24 +342,24 @@ sz_u512_vec_t sz_inclusive_min(sz_i32_t previous, sz_error_cost_t gap, sz_u512_v shifted_vec.i32s[0] = previous; shifted_vec.zmm = _mm512_add_epi32(shifted_vec.zmm, gap_vec.zmm); new_vec.zmm = _mm512_mask_max_epi32(new_vec.zmm, mask_skip_one, new_vec.zmm, shifted_vec.zmm); - sz_assert(new_vec.i32s[0] == max(previous + gap, base_vec.i32s[0])); + _sz_assert(new_vec.i32s[0] == max(previous + gap, base_vec.i32s[0])); shifted_vec.zmm = _mm512_permutexvar_epi32(shift_by_two_vec.zmm, 
new_vec.zmm); shifted_vec.zmm = _mm512_add_epi32(shifted_vec.zmm, gap_double_vec.zmm); new_vec.zmm = _mm512_mask_max_epi32(new_vec.zmm, mask_skip_two, new_vec.zmm, shifted_vec.zmm); - sz_assert(new_vec.i32s[0] == max(previous + gap, base_vec.i32s[0])); + _sz_assert(new_vec.i32s[0] == max(previous + gap, base_vec.i32s[0])); shifted_vec.zmm = _mm512_permutexvar_epi32(shift_by_four_vec.zmm, new_vec.zmm); shifted_vec.zmm = _mm512_add_epi32(shifted_vec.zmm, gap_quad_vec.zmm); new_vec.zmm = _mm512_mask_max_epi32(new_vec.zmm, mask_skip_four, new_vec.zmm, shifted_vec.zmm); - sz_assert(new_vec.i32s[0] == max(previous + gap, base_vec.i32s[0])); + _sz_assert(new_vec.i32s[0] == max(previous + gap, base_vec.i32s[0])); shifted_vec.zmm = _mm512_permutexvar_epi32(shift_by_eight_vec.zmm, new_vec.zmm); shifted_vec.zmm = _mm512_add_epi32(shifted_vec.zmm, gap_octa_vec.zmm); new_vec.zmm = _mm512_mask_max_epi32(new_vec.zmm, mask_skip_eight, new_vec.zmm, shifted_vec.zmm); - sz_assert(new_vec.i32s[0] == max(previous + gap, base_vec.i32s[0])); - for (sz_size_t i = 1; i < 16; i++) sz_assert(new_vec.i32s[i] == max(new_vec.i32s[i - 1] + gap, new_vec.i32s[i])); + _sz_assert(new_vec.i32s[0] == max(previous + gap, base_vec.i32s[0])); + for (sz_size_t i = 1; i < 16; i++) _sz_assert(new_vec.i32s[i] == max(new_vec.i32s[i - 1] + gap, new_vec.i32s[i])); return new_vec; } @@ -1015,7 +1015,7 @@ SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t return sz_order_serial(a, a_length, b, b_length); } -SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { +SZ_PUBLIC sz_ordering_t sz_order_skylake(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { sz_u512_vec_t a_vec, b_vec; // The rare case, when both string are very long surves as a great example to understand @@ -1124,8 +1124,8 @@ SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt for (; length >= 128; target += 64, source += 64, length -= 64) { second_vec.zmm = _mm512_load_si512(target + 64); combined_vec.zmm = _mm512_permutex2var_epi8(first_vec.zmm, selector_vec.zmm, second_vec.zmm); - sz_assert(combined_vec.u8s[0] == source[0]); - sz_assert(combined_vec.u8s[63] == source[63]); + _sz_assert(combined_vec.u8s[0] == source[0]); + _sz_assert(combined_vec.u8s[63] == source[63]); _mm512_store_si512(target, combined_vec.zmm); first_vec.zmm = second_vec.zmm; } @@ -1147,8 +1147,8 @@ SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt second_vec.zmm = _mm512_load_si512(target + 64); first_shuffled_vec.zmm = _mm512_shuffle_epi8(first_vec.zmm, first_byte_permute_vec.zmm); second_shuffled_vec.zmm = _mm512_shuffle_epi8(second_vec.zmm, second_byte_permute_vec.zmm); - sz_assert(first_shuffled_vec.u8s[0] == source[0]); - sz_assert(second_shuffled_vec.u8s[63] == source[63]); + _sz_assert(first_shuffled_vec.u8s[0] == source[0]); + _sz_assert(second_shuffled_vec.u8s[63] == source[63]); combined_vec.zmm = _mm512_or_si512(first_shuffled_vec.zmm, second_shuffled_vec.zmm); _mm512_store_si512(target, combined_vec.zmm); first_vec.zmm = second_vec.zmm; @@ -1279,8 +1279,8 @@ SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt second_vec.zmm = _mm512_load_si512(source_page + 64); second_vec.zmm = _mm512_permutexvar_epi8(selector_vec.zmm, second_vec.zmm); combined_vec.zmm = _mm512_mask_blend_epi8(blend_mask, second_vec.zmm, first_vec.zmm); - sz_assert(combined_vec.u8s[0] == source[0]); - sz_assert(combined_vec.u8s[63] == 
source[63]); + _sz_assert(combined_vec.u8s[0] == source[0]); + _sz_assert(combined_vec.u8s[63] == source[63]); _mm512_store_si512(target, combined_vec.zmm); first_vec.zmm = second_vec.zmm; } @@ -1313,8 +1313,8 @@ SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt second_vec.zmm = _mm512_load_si512(source_second_page - 64); second_vec.zmm = _mm512_permutexvar_epi8(selector_vec.zmm, second_vec.zmm); combined_vec.zmm = _mm512_mask_blend_epi8(blend_mask, second_vec.zmm, first_vec.zmm); - sz_assert(combined_vec.u8s[0] == source[0]); - sz_assert(combined_vec.u8s[63] == source[63]); + _sz_assert(combined_vec.u8s[0] == source[0]); + _sz_assert(combined_vec.u8s[63] == source[63]); _mm512_store_si512(target + head_length + body_length, combined_vec.zmm); first_vec.zmm = second_vec.zmm; } diff --git a/include/stringzilla/find.h b/include/stringzilla/find.h index 91892a0f..b5740429 100644 --- a/include/stringzilla/find.h +++ b/include/stringzilla/find.h @@ -113,23 +113,23 @@ SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t haystack, sz_size_t h_length, sz_c #if SZ_USE_HASWELL /** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +SZ_PUBLIC sz_cptr_t sz_find_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); /** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +SZ_PUBLIC sz_cptr_t sz_rfind_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); #endif #if SZ_USE_SKYLAKE /** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_skylake(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +SZ_PUBLIC sz_cptr_t sz_find_skylake(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); /** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); #endif #if SZ_USE_NEON /** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); /** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); #endif /** @@ -173,23 +173,23 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz #if SZ_USE_HASWELL /** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +SZ_PUBLIC sz_cptr_t sz_find_charset_haswell(sz_cptr_t haystack, sz_size_t length, sz_charset_t const *set); /** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_haswell(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +SZ_PUBLIC sz_cptr_t sz_rfind_charset_haswell(sz_cptr_t haystack, sz_size_t length, sz_charset_t const *set); #endif #if SZ_USE_ICE /** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t haystack, sz_size_t length, sz_charset_t const *set); /** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_ice(sz_cptr_t haystack, 
sz_size_t h_length, sz_cptr_t needle); +SZ_PUBLIC sz_cptr_t sz_rfind_charset_ice(sz_cptr_t haystack, sz_size_t length, sz_charset_t const *set); #endif #if SZ_USE_NEON /** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t haystack, sz_size_t length, sz_charset_t const *set); /** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); +SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t haystack, sz_size_t length, sz_charset_t const *set); #endif #pragma endregion // Core API @@ -375,7 +375,7 @@ SZ_INTERNAL sz_u64_vec_t _sz_u64_each_2byte_equal(sz_u64_vec_t a, sz_u64_vec_t b SZ_INTERNAL sz_cptr_t _sz_find_2byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { // This is an internal method, and the haystack is guaranteed to be at least 2 bytes long. - sz_assert(h_length >= 2 && "The haystack is too short."); + _sz_assert(h_length >= 2 && "The haystack is too short."); sz_cptr_t const h_end = h + h_length; #if !SZ_USE_MISALIGNED_LOADS @@ -429,7 +429,7 @@ SZ_INTERNAL sz_u64_vec_t _sz_u64_each_4byte_equal(sz_u64_vec_t a, sz_u64_vec_t b SZ_INTERNAL sz_cptr_t _sz_find_4byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 4 && "The haystack is too short."); + _sz_assert(h_length >= 4 && "The haystack is too short."); sz_cptr_t const h_end = h + h_length; #if !SZ_USE_MISALIGNED_LOADS @@ -493,7 +493,7 @@ SZ_INTERNAL sz_u64_vec_t _sz_u64_each_3byte_equal(sz_u64_vec_t a, sz_u64_vec_t b SZ_INTERNAL sz_cptr_t _sz_find_3byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 3 && "The haystack is too short."); + _sz_assert(h_length >= 3 && "The haystack is too short."); sz_cptr_t const h_end = h + h_length; #if !SZ_USE_MISALIGNED_LOADS @@ -550,7 +550,7 @@ SZ_INTERNAL sz_cptr_t _sz_find_3byte_serial(sz_cptr_t h, sz_size_t h_length, sz_ SZ_INTERNAL sz_cptr_t _sz_find_horspool_upto_256bytes_serial( // sz_cptr_t h_chars, sz_size_t h_length, // sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); + _sz_assert(n_length <= 256 && "The pattern is too long."); // Several popular string matching algorithms are using a bad-character shift table. // Boyer Moore: https://www-igm.univ-mlv.fr/~lecroq/string/node14.html // Quick Search: https://www-igm.univ-mlv.fr/~lecroq/string/node19.html @@ -604,7 +604,7 @@ SZ_INTERNAL sz_cptr_t _sz_find_horspool_upto_256bytes_serial( // SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_upto_256bytes_serial( // sz_cptr_t h_chars, sz_size_t h_length, // sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); + _sz_assert(n_length <= 256 && "The pattern is too long."); union { sz_u8_t jumps[256]; sz_u64_vec_t vecs[64]; @@ -941,7 +941,7 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_haswell(sz_cptr_t text, sz_size_t length, sz // sz_u8_t input = *(sz_u8_t const *)(text + i); // sz_u8_t lo_nibble = input & 0x0f; // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); + // _sz_assert(bitmask_vec.u8s[i] == bitmask); // } // // Shift right every byte by 4 bits. 
@@ -959,8 +959,8 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_haswell(sz_cptr_t text, sz_size_t length, sz // sz_u8_t hi_nibble = input >> 4; // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); + // _sz_assert(bitset_even_vec.u8s[i] == bitset_even); + // _sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); // } // __m256i take_first = _mm256_cmpgt_epi8(_mm256_set1_epi8(8), lower_nibbles_vec.ymm); @@ -1183,8 +1183,8 @@ SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t int potential_offset = sz_u64_clz(matches); if (n_length <= 3 || sz_equal_skylake(h + h_length - n_length - potential_offset, n, n_length)) return h + h_length - n_length - potential_offset; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); + _sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && + "The bit must be set before we squash it"); matches &= ~((sz_u64_t)1 << (63 - potential_offset)); } } @@ -1204,8 +1204,8 @@ SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t int potential_offset = sz_u64_clz(matches); if (n_length <= 3 || sz_equal_skylake(h + 64 - potential_offset - 1, n, n_length)) return h + 64 - potential_offset - 1; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); + _sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && + "The bit must be set before we squash it"); matches &= ~((sz_u64_t)1 << (63 - potential_offset)); } } @@ -1223,13 +1223,16 @@ SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t * - 2017 Skylake: F, CD, ER, PF, VL, DQ, BW, * - 2018 CannonLake: IFMA, VBMI, * - 2019 Ice Lake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES. + * + * We are going to use VBMI2 for `_mm256_maskz_compress_epi8`. */ #pragma region Ice Lake Implementation #if SZ_USE_ICE #pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ - apply_to = function) +#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "avx512vbmi2", "bmi", "bmi2") +#pragma clang attribute push( \ + __attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vbmi2,bmi,bmi2"))), \ + apply_to = function) SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { @@ -1247,7 +1250,7 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t text, sz_size_t length, sz_cha sz_u512_vec_t filter_even_vec, filter_odd_vec; __m256i filter_ymm = _mm256_lddqu_si256((__m256i const *)filter); // There are a few way to initialize filters without having native strided loads. 
- // In the cronological order of experiments: + // In the chronological order of experiments: // - serial code initializing 128 bytes of odd and even mask // - using several shuffles // - using `_mm512_permutexvar_epi8` @@ -1260,14 +1263,14 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t text, sz_size_t length, sz_cha // After the unzipping operation, we can validate the contents of the vectors like this: // // for (sz_size_t i = 0; i != 16; ++i) { - // sz_assert(filter_even_vec.u8s[i] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 16] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 16] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 32] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 32] == filter->_u8s[i * 2 + 1]); - // sz_assert(filter_even_vec.u8s[i + 48] == filter->_u8s[i * 2]); - // sz_assert(filter_odd_vec.u8s[i + 48] == filter->_u8s[i * 2 + 1]); + // _sz_assert(filter_even_vec.u8s[i] == filter->_u8s[i * 2]); + // _sz_assert(filter_odd_vec.u8s[i] == filter->_u8s[i * 2 + 1]); + // _sz_assert(filter_even_vec.u8s[i + 16] == filter->_u8s[i * 2]); + // _sz_assert(filter_odd_vec.u8s[i + 16] == filter->_u8s[i * 2 + 1]); + // _sz_assert(filter_even_vec.u8s[i + 32] == filter->_u8s[i * 2]); + // _sz_assert(filter_odd_vec.u8s[i + 32] == filter->_u8s[i * 2 + 1]); + // _sz_assert(filter_even_vec.u8s[i + 48] == filter->_u8s[i * 2]); + // _sz_assert(filter_odd_vec.u8s[i + 48] == filter->_u8s[i * 2 + 1]); // } // sz_u512_vec_t text_vec; @@ -1310,7 +1313,7 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t text, sz_size_t length, sz_cha // sz_u8_t input = *(sz_u8_t const *)(text + i); // sz_u8_t lo_nibble = input & 0x0f; // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); + // _sz_assert(bitmask_vec.u8s[i] == bitmask); // } // // Shift right every byte by 4 bits. @@ -1328,8 +1331,8 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t text, sz_size_t length, sz_cha // sz_u8_t hi_nibble = input >> 4; // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); + // _sz_assert(bitset_even_vec.u8s[i] == bitset_even); + // _sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); // } // // TODO: Is this a good place for ternary logic? 
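Both the Skylake and NEON reverse searchers in this file share the same candidate-filtering pattern around `sz_u64_clz`: take the highest set bit of the match mask, verify the full needle at that offset, and squash the bit if the verification fails. A hedged scalar sketch of that loop, not part of the patch, with `clz64` standing in for `sz_u64_clz` and `full_match_at` as a hypothetical verification helper:

    while (matches) {
        int potential_offset = clz64(matches);                // offset of the candidate, counted from the tail
        if (full_match_at(h, h_length, n, n_length, potential_offset))
            return h + h_length - n_length - potential_offset;
        matches &= ~((sz_u64_t)1 << (63 - potential_offset)); // the bit must be set before we squash it
    }

The NEON variant additionally divides the offset by four, as each byte match occupies a whole nibble of its mask, and the `_sz_assert` calls in the surrounding hunks guard precisely the "bit must be set" invariant before the squash.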
@@ -1539,8 +1542,8 @@ SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, int potential_offset = sz_u64_clz(matches) / 4; if (sz_equal_neon(h + h_length - n_length - potential_offset, n, n_length)) return h + h_length - n_length - potential_offset; - sz_assert((matches & (1ull << (63 - potential_offset * 4))) != 0 && - "The bit must be set before we squash it"); + _sz_assert((matches & (1ull << (63 - potential_offset * 4))) != 0 && + "The bit must be set before we squash it"); matches &= ~(1ull << (63 - potential_offset * 4)); } } diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index d8f4a05e..0e5e883e 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -47,7 +47,10 @@ SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length); * * @see sz_hashes, sz_hashes_fingerprint, sz_hashes_intersection */ -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length); +SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length) { + sz_unused(text && length); + return 0; +} /** * @brief Computes the Karp-Rabin rolling hashes of a string supplying them to the provided `callback`. @@ -99,7 +102,9 @@ SZ_DYNAMIC void sz_hashes( */ SZ_PUBLIC void sz_hashes_fingerprint( // sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_ptr_t fingerprint, sz_size_t fingerprint_bytes); + sz_ptr_t fingerprint, sz_size_t fingerprint_bytes) { + sz_unused(text && length && window_length && fingerprint && fingerprint_bytes); +} /** * @brief Given a hash-fingerprint of a textual document, computes the number of intersecting hashes @@ -145,16 +150,18 @@ SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length); /** @copydoc sz_hash */ SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length); -/** @copydoc sz_generate */ -SZ_PUBLIC void sz_generate_serial( // - sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, sz_random_generator_t generate, - void *generator); - /** @copydoc sz_hashes */ SZ_PUBLIC void sz_hashes_serial( // sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // sz_hash_callback_t callback, void *callback_handle); +/** @copydoc sz_generate */ +SZ_PUBLIC void sz_generate_serial( // + sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, sz_random_generator_t generate, + void *generator) { + sz_unused(alphabet && cardinality && text && length && generate && generator); +} + #pragma endregion // Core API #pragma region Serial Implementation @@ -337,6 +344,33 @@ SZ_PUBLIC void sz_hashes_serial(sz_cptr_t start, sz_size_t length, sz_size_t win } } +/** @brief An internal callback used to set a bit in a power-of-two length binary fingerprint of a string. */ +SZ_INTERNAL void _sz_hashes_fingerprint_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) { + sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; + sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; + sz_size_t fingerprint_bytes = fingerprint_buffer->length; + fingerprint_u8s[(hash / 8) & (fingerprint_bytes - 1)] |= (1 << (hash & 7)); + sz_unused(start && length); +} + +/** @brief An internal callback used to set a bit in a @b non power-of-two length binary fingerprint of a string. 
*/ +SZ_INTERNAL void _sz_hashes_fingerprint_non_pow2_callback( // + sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) { + sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; + sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; + sz_size_t fingerprint_bytes = fingerprint_buffer->length; + fingerprint_u8s[(hash / 8) % fingerprint_bytes] |= (1 << (hash & 7)); + sz_unused(start && length); +} + +/** @brief An internal callback, used to mix all the running hashes into one pointer-size value. */ +SZ_INTERNAL void _sz_hashes_fingerprint_scalar_callback( // + sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *scalar_handle) { + sz_unused(start && length && hash && scalar_handle); + sz_size_t *scalar_ptr = (sz_size_t *)scalar_handle; + *scalar_ptr ^= hash; +} + #undef _sz_shift_low #undef _sz_shift_high #undef _sz_hash_mix @@ -350,10 +384,10 @@ SZ_PUBLIC void sz_hashes_serial(sz_cptr_t start, sz_size_t length, sz_size_t win #pragma region Haswell Implementation #if SZ_USE_HASWELL #pragma GCC push_options -#pragma GCC target("haswell") -#pragma clang attribute push(__attribute__((target("haswell"))), apply_to = function) +#pragma GCC target("avx2") +#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) -SZ_PUBLIC sz_u64_t sz_checksum_avx2(sz_cptr_t text, sz_size_t length) { +SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) { // The naive implementation of this function is very simple. // It assumes the CPU is great at handling unaligned "loads". // @@ -448,8 +482,8 @@ SZ_INTERNAL __m256i _mm256_mul_epu64(__m256i a, __m256i b) { return prod; } -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { +SZ_PUBLIC void sz_hashes_haswell(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // + sz_hash_callback_t callback, void *callback_handle) { if (length < window_length || !window_length) return; if (length < 4 * window_length) { @@ -702,8 +736,8 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) { } } -SZ_PUBLIC void sz_hashes_ice(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { +SZ_PUBLIC void sz_hashes_skylake(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // + sz_hash_callback_t callback, void *callback_handle) { if (length < window_length || !window_length) return; if (length < 4 * window_length) { @@ -888,7 +922,7 @@ SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { #if SZ_USE_ICE return sz_checksum_ice(text, length); #elif SZ_USE_HASWELL - return sz_checksum_avx2(text, length); + return sz_checksum_haswell(text, length); #elif SZ_USE_NEON return sz_checksum_neon(text, length); #else @@ -898,10 +932,10 @@ SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // sz_hash_callback_t callback, void *callback_handle) { -#if SZ_USE_ICE - sz_hashes_ice(text, length, window_length, window_step, callback, callback_handle); +#if SZ_USE_SKYLAKE + sz_hashes_skylake(text, length, window_length, window_step, callback, callback_handle); #elif SZ_USE_HASWELL - sz_hashes_avx2(text, length, window_length, window_step, callback, callback_handle); + sz_hashes_haswell(text, length, window_length, window_step, 
callback, callback_handle); #else sz_hashes_serial(text, length, window_length, window_step, callback, callback_handle); #endif diff --git a/include/stringzilla/memory.h b/include/stringzilla/memory.h index 06a3dc60..c17f031f 100644 --- a/include/stringzilla/memory.h +++ b/include/stringzilla/memory.h @@ -64,29 +64,29 @@ SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value); #if SZ_USE_HASWELL /** @copydoc sz_copy */ -SZ_PUBLIC sz_cptr_t sz_copy_haswell(sz_ptr_t target, sz_cptr_t source, sz_size_t length); +SZ_PUBLIC void sz_copy_haswell(sz_ptr_t target, sz_cptr_t source, sz_size_t length); /** @copydoc sz_move */ -SZ_PUBLIC sz_cptr_t sz_move_haswell(sz_ptr_t target, sz_cptr_t source, sz_size_t length); +SZ_PUBLIC void sz_move_haswell(sz_ptr_t target, sz_cptr_t source, sz_size_t length); /** @copydoc sz_rfind_fill */ -SZ_PUBLIC sz_cptr_t sz_fill_haswell(sz_ptr_t target, sz_size_t length, sz_u8_t value); +SZ_PUBLIC void sz_fill_haswell(sz_ptr_t target, sz_size_t length, sz_u8_t value); #endif #if SZ_USE_SKYLAKE /** @copydoc sz_copy */ -SZ_PUBLIC sz_cptr_t sz_copy_skylake(sz_ptr_t target, sz_cptr_t source, sz_size_t length); +SZ_PUBLIC void sz_copy_skylake(sz_ptr_t target, sz_cptr_t source, sz_size_t length); /** @copydoc sz_move */ -SZ_PUBLIC sz_cptr_t sz_move_skylake(sz_ptr_t target, sz_cptr_t source, sz_size_t length); +SZ_PUBLIC void sz_move_skylake(sz_ptr_t target, sz_cptr_t source, sz_size_t length); /** @copydoc sz_rfind_fill */ -SZ_PUBLIC sz_cptr_t sz_fill_skylake(sz_ptr_t target, sz_size_t length, sz_u8_t value); +SZ_PUBLIC void sz_fill_skylake(sz_ptr_t target, sz_size_t length, sz_u8_t value); #endif #if SZ_USE_NEON /** @copydoc sz_copy */ -SZ_PUBLIC sz_cptr_t sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); +SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); /** @copydoc sz_move */ -SZ_PUBLIC sz_cptr_t sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); +SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); /** @copydoc sz_rfind_fill */ -SZ_PUBLIC sz_cptr_t sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value); +SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value); #endif /** @@ -358,13 +358,13 @@ SZ_PUBLIC void sz_fill_haswell(sz_ptr_t target, sz_size_t length, sz_u8_t value) if (head_length & 8) *(sz_u64_t *)target = value64, target += 8, head_length -= 8; if (head_length & 16) _mm_store_si128((__m128i *)target, _mm_set1_epi8(value_char)), target += 16, head_length -= 16; - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); + _sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); // Fill the aligned body of the buffer. for (; body_length >= 32; target += 32, body_length -= 32) _mm256_store_si256((__m256i *)target, value_vec); // Fill the tail of the buffer. This part is much cleaner with AVX-512. 
- sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); + _sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); if (tail_length & 16) _mm_store_si128((__m128i *)target, _mm_set1_epi8(value_char)), target += 16, tail_length -= 16; if (tail_length & 8) *(sz_u64_t *)target = value64, target += 8, tail_length -= 8; @@ -374,7 +374,7 @@ SZ_PUBLIC void sz_fill_haswell(sz_ptr_t target, sz_size_t length, sz_u8_t value) } } -SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { +SZ_PUBLIC void sz_copy_haswell(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { // The naive implementation of this function is very simple. // It assumes the CPU is great at handling unaligned "stores" and "loads". // @@ -387,7 +387,7 @@ SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) // For now, let's avoid the cases beyond the L2 size. int is_huge = length > 1ull * 1024ull * 1024ull; if (length <= 32) { sz_copy_serial(target, source, length); } - // When dealing wirh larger arrays, the optimization is not as simple as with the `sz_fill_haswell` function, + // When dealing with larger arrays, the optimization is not as simple as with the `sz_fill_haswell` function, // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer, // we can use aligned loads and stores, and the performance will be great. else if ((sz_size_t)target % 32 == 0 && (sz_size_t)source % 32 == 0 && !is_huge) { @@ -411,7 +411,7 @@ SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) if (head_length & 16) _mm_store_si128((__m128i *)target, _mm_lddqu_si128((__m128i const *)source)), target += 16, source += 16, head_length -= 16; - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); + _sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); // Fill the aligned body of the buffer. if (!is_huge) { @@ -429,7 +429,7 @@ SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) } // Fill the tail of the buffer. This part is much cleaner with AVX-512. 
- sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); + _sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); if (tail_length & 16) _mm_store_si128((__m128i *)target, _mm_lddqu_si128((__m128i const *)source)), target += 16, source += 16, tail_length -= 16; @@ -440,7 +440,7 @@ SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) } } -SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { +SZ_PUBLIC void sz_move_haswell(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { if (target < source || target >= source + length) { for (; length >= 32; target += 32, source += 32, length -= 32) _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); @@ -454,7 +454,7 @@ SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) } } -SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { +SZ_PUBLIC void sz_look_up_transform_haswell(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { // If the input is tiny (especially smaller than the look-up table itself), we may end up paying // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. @@ -637,7 +637,7 @@ SZ_PUBLIC void sz_fill_skylake(sz_ptr_t target, sz_size_t length, sz_u8_t value) } } -SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { +SZ_PUBLIC void sz_copy_skylake(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { // The naive implementation of this function is very simple. // It assumes the CPU is great at handling unaligned "stores" and "loads". // @@ -656,7 +656,7 @@ SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt __mmask64 mask = _sz_u64_mask_until(length); _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); } - // When dealing wirh larger arrays, the optimization is not as simple as with the `sz_fill_skylake` function, + // When dealing with larger arrays, the optimization is not as simple as with the `sz_fill_skylake` function, // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer, // we can use aligned loads and stores, and the performance will be great. else if ((sz_size_t)target % 64 == 0 && (sz_size_t)source % 64 == 0 && !is_huge) { @@ -715,7 +715,7 @@ SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt } } -SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { +SZ_PUBLIC void sz_move_skylake(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { if (target == source) return; // Don't be silly, don't move the data if it's already there. // On very short buffers, that are one cache line in width or less, we don't need any loops. @@ -757,7 +757,7 @@ SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt } // If the regions don't overlap at all, just use "copy" and save some brain cells thinking about corner cases. 
- else if (target + length < source || target >= source + length) { sz_copy_avx512(target, source, length); } + else if (target + length < source || target >= source + length) { sz_copy_skylake(target, source, length); } // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores @@ -1257,9 +1257,9 @@ SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length) SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { #if SZ_USE_ICE - sz_copy_avx512(target, source, length); + sz_copy_skylake(target, source, length); #elif SZ_USE_HASWELL - sz_copy_avx2(target, source, length); + sz_copy_haswell(target, source, length); #elif SZ_USE_NEON sz_copy_neon(target, source, length); #else @@ -1269,9 +1269,9 @@ SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { #if SZ_USE_ICE - sz_move_avx512(target, source, length); + sz_move_skylake(target, source, length); #elif SZ_USE_HASWELL - sz_move_avx2(target, source, length); + sz_move_haswell(target, source, length); #elif SZ_USE_NEON sz_move_neon(target, source, length); #else diff --git a/include/stringzilla/similarity.h b/include/stringzilla/similarity.h index 5451c95f..943f7f35 100644 --- a/include/stringzilla/similarity.h +++ b/include/stringzilla/similarity.h @@ -16,6 +16,7 @@ #ifndef STRINGZILLA_SIMILARITY_H_ #define STRINGZILLA_SIMILARITY_H_ +#include "find.h" #include "types.h" #ifdef __cplusplus @@ -183,6 +184,20 @@ SZ_PUBLIC sz_ssize_t sz_alignment_score_serial( // sz_error_cost_t const *subs, sz_error_cost_t gap, // sz_memory_allocator_t *alloc); +#if SZ_USE_ICE + +SZ_INTERNAL sz_size_t sz_edit_distance_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // + sz_size_t bound, sz_memory_allocator_t *alloc); + +SZ_INTERNAL sz_ssize_t sz_alignment_score_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // + sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc); + +#endif + #pragma endregion // Core API #pragma region Serial Implementation @@ -200,8 +215,8 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // } // TODO: Generalize to remove the following asserts! - sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix."); - sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet."); + _sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix."); + _sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet."); sz_unused(longer_length && bound); // We are going to store 3 diagonals of the matrix. @@ -269,7 +284,8 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // * Stores only 2 rows of the Levenshtein matrix, but uses 64-bit integers for the distance values, * and upcasts UTF8 variable-length codepoints to 64-bit integers for faster addressing. * - * ! In the worst case for 2 strings of length 100, that contain just one 16-bit codepoint this will result in extra: + * ! 
In the worst case for 2 strings of length 100, that contain just one 16-bit codepoint this will result in + * extra: * + 2 rows * 100 slots * 8 bytes/slot = 1600 bytes of memory for the two rows of the Levenshtein matrix rows. * + 100 codepoints * 2 strings * 4 bytes/codepoint = 800 bytes of memory for the UTF8 buffer. * = 2400 bytes of memory or @b 12x memory amplification! @@ -302,10 +318,13 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // // If the strings contain Unicode characters, let's estimate the max character width, // and use it to allocate a larger buffer to decode UTF8. - if ((can_be_unicode == sz_true_k) && - (sz_isascii(longer, longer_length) == sz_false_k || sz_isascii(shorter, shorter_length) == sz_false_k)) { - buffer_length += (shorter_length + longer_length) * sizeof(sz_rune_t); - } + sz_charset_t ascii_charset; + sz_charset_init_ascii(&ascii_charset); + sz_charset_invert(&ascii_charset); + int const longer_is_ascii = sz_find_charset_serial(longer, longer_length, &ascii_charset) == SZ_NULL_CHAR; + int const shorter_is_ascii = sz_find_charset_serial(shorter, shorter_length, &ascii_charset) == SZ_NULL_CHAR; + int const will_convert_to_unicode = can_be_unicode == sz_true_k && (!longer_is_ascii || !shorter_is_ascii); + if (will_convert_to_unicode) { buffer_length += (shorter_length + longer_length) * sizeof(sz_rune_t); } else { can_be_unicode = sz_false_k; } // If the allocation fails, return the maximum distance. @@ -619,19 +638,19 @@ SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( // /** * @brief Computes the edit distance between two very short byte-strings using the AVX-512VBMI extensions. * - * Applies to string lengths up to 63, and evaluates at most (63 * 2 + 1 = 127) diagonals, or just as many loop cycles. - * Supports an early exit, if the distance is bounded. - * Keeps all of the data and Levenshtein matrices skew diagonal in just a couple of registers. - * Benefits from the @b `vpermb` instructions, that can rotate the bytes across the entire ZMM register. + * Applies to string lengths up to 63, and evaluates at most (63 * 2 + 1 = 127) diagonals, or just as many loop + * cycles. Supports an early exit, if the distance is bounded. Keeps all of the data and Levenshtein matrices skew + * diagonal in just a couple of registers. Benefits from the @b `vpermb` instructions, that can rotate the bytes + * across the entire ZMM register. */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // +SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // sz_size_t bound) { sz_size_t const max_length = 63u; - sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); - sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); + _sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); + _sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register. // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. 
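For context on what these `_sz_edit_distance_skewed_diagonals_*` kernels vectorize: each anti-diagonal k = i + j of the Wagner-Fischer matrix depends only on the two previous diagonals, so three row-indexed buffers are enough. Below is a hedged plain-C sketch of that traversal, a standalone illustration that is not part of the patch, with allocation error handling omitted:

    #include <stddef.h>
    #include <stdlib.h>

    size_t levenshtein_by_diagonals_sketch(char const *a, size_t m, char const *b, size_t n) {
        if (m == 0) return n;
        if (n == 0) return m;
        size_t *prev2 = (size_t *)malloc((m + 1) * sizeof(size_t)); // diagonal k - 2
        size_t *prev = (size_t *)malloc((m + 1) * sizeof(size_t));  // diagonal k - 1
        size_t *cur = (size_t *)malloc((m + 1) * sizeof(size_t));   // diagonal k
        prev2[0] = 0;             // D[0][0]
        prev[0] = 1, prev[1] = 1; // D[0][1] and D[1][0]
        for (size_t k = 2; k <= m + n; ++k) {
            size_t i_min = k > n ? k - n : 0, i_max = k < m ? k : m;
            for (size_t i = i_min; i <= i_max; ++i) {
                size_t j = k - i; // the cell (i, j) sits on diagonal k
                if (i == 0) { cur[i] = j; }      // first row: D[0][j] = j
                else if (j == 0) { cur[i] = i; } // first column: D[i][0] = i
                else {
                    size_t deletion = prev[i - 1] + 1;
                    size_t insertion = prev[i] + 1;
                    size_t substitution = prev2[i - 1] + (a[i - 1] != b[j - 1]);
                    size_t best = deletion < insertion ? deletion : insertion;
                    cur[i] = best < substitution ? best : substitution;
                }
            }
            size_t *rotated = prev2;
            prev2 = prev, prev = cur, cur = rotated; // rotate the three diagonal buffers
        }
        size_t result = prev[m]; // after the last rotation, diagonal m + n sits in `prev`
        free(prev2), free(prev), free(cur);
        return result;
    }

The SIMD variants follow the same dependency structure, but keep an entire diagonal in a ZMM register and update all of its cells in one step, rotating the string bytes with `vpermb` instead of re-indexing the buffers.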
@@ -792,9 +811,9 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_avx512( // * - source code analysis, assuming most lines are either under 80 or under 120 characters long. * - DNA sequence alignment, as most short reads are 50-300 characters long. */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto127_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // +SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto127_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // sz_size_t bound) { sz_unused(shorter && shorter_length && longer && longer_length && bound); return 0; @@ -812,9 +831,9 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto127_avx512( // * This is the largest space-efficient variant, as strings beyond 255 characters may require * 16-bit accumulators, which would be a significant bottleneck. */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // +SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // sz_size_t bound) { sz_unused(shorter && shorter_length && longer && longer_length && bound); return 0; @@ -833,9 +852,9 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto_avx512( // * This is the largest space-efficient variant, as strings beyond 255 characters may require * 16-bit accumulators, which would be a significant bottleneck. */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto255bound_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // +SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto255bound_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // sz_size_t bound) { sz_unused(shorter && shorter_length && longer && longer_length && bound); return 0; @@ -850,17 +869,17 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto255bound_avx512( // * * Each string is unpacked into 128 characters * 4 bytes per character / 64 bytes per register = 8 registers. */ -SZ_INTERNAL sz_size_t _sz_edit_distance_utf8_skewed_diagonals_upto127_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // +SZ_INTERNAL sz_size_t _sz_edit_distance_utf8_skewed_diagonals_upto127_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // sz_size_t bound) { sz_unused(shorter && shorter_length && longer && longer_length && bound); return 0; } -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // +SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // sz_size_t bound, sz_memory_allocator_t *alloc) { sz_unused(shorter && longer && bound && alloc); @@ -874,8 +893,8 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // // TODO: Generalize! 
sz_size_t const max_length = 256u * 256u; - sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); - sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); + _sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); + _sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); sz_unused(longer_length && bound && max_length); #if 0 @@ -1017,7 +1036,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // return 0; } -SZ_INTERNAL sz_size_t sz_edit_distance_avx512( // +SZ_INTERNAL sz_size_t sz_edit_distance_ice( // sz_cptr_t shorter, sz_size_t shorter_length, // sz_cptr_t longer, sz_size_t longer_length, // sz_size_t bound, sz_memory_allocator_t *alloc) { @@ -1044,10 +1063,10 @@ SZ_INTERNAL sz_size_t sz_edit_distance_avx512( // // Dispatch the right implementation based on the length of the strings. if (longer_length < 64u) - return _sz_edit_distance_skewed_diagonals_upto63_avx512( // + return _sz_edit_distance_skewed_diagonals_upto63_ice( // shorter, shorter_length, longer, longer_length, bound); // else if (longer_length < 256u * 256u) - // return _sz_edit_distance_skewed_diagonals_upto65k_avx512( // + // return _sz_edit_distance_skewed_diagonals_upto65k_ice( // // shorter, shorter_length, longer, longer_length, bound, alloc); else return sz_edit_distance_serial(shorter, shorter_length, longer, longer_length, bound, alloc); @@ -1061,9 +1080,9 @@ SZ_INTERNAL sz_size_t sz_edit_distance_avx512( // * * Unlike the `_sz_edit_distance_skewed_diagonals_upto65k_avx512` method, this one uses signed integers to store * the accumulated score. Moreover, it's primary bottleneck is the latency of gathering the substitution costs - * from the substitution matrix. If we use the diagonal order, we will be comparing a slice of the first string with - * a slice of the second. If we stick to the conventional horizontal order, we will be comparing one character against - * a slice, which is much easier to optimize. In that case we are sampling costs not from arbitrary parts of + * from the substitution matrix. If we use the diagonal order, we will be comparing a slice of the first string + * with a slice of the second. If we stick to the conventional horizontal order, we will be comparing one character + * against a slice, which is much easier to optimize. In that case we are sampling costs not from arbitrary parts of * a 256 x 256 matrix, but from a single row! */ SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_ice( // @@ -1091,7 +1110,7 @@ SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_ice( // sz_size_t const max_length = 256ull * 256ull * 256ull; sz_size_t const n = longer_length + 1; - sz_assert(n < max_length && "The length must fit into 24-bit integer. Otherwise use serial variant."); + _sz_assert(n < max_length && "The length must fit into 24-bit integer. Otherwise use serial variant."); sz_unused(longer_length && max_length); sz_size_t buffer_length = sizeof(sz_i32_t) * n * 2; @@ -1099,7 +1118,7 @@ SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_ice( // sz_i32_t *previous_distances = distances; sz_i32_t *current_distances = previous_distances + n; - // Intialize the first row of the Levenshtein matrix with `iota`. + // Initialize the first row of the Levenshtein matrix with `iota`. 
for (sz_size_t idx_longer = 0; idx_longer != n; ++idx_longer) previous_distances[idx_longer] = (sz_i32_t)idx_longer * gap; @@ -1135,8 +1154,9 @@ SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_ice( // // for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) { // sz_ssize_t cost_deletion = previous_distances[idx_longer + 1] + gap; // sz_ssize_t cost_insertion = current_distances[idx_longer] + gap; - // sz_ssize_t cost_substitution = previous_distances[idx_longer] + row_subs[longer_unsigned[idx_longer]]; - // current_distances[idx_longer + 1] = sz_min_of_three(cost_deletion, cost_insertion, cost_substitution); + // sz_ssize_t cost_substitution = previous_distances[idx_longer] + + // row_subs[longer_unsigned[idx_longer]]; current_distances[idx_longer + 1] = + // sz_min_of_three(cost_deletion, cost_insertion, cost_substitution); // } // // Given the complexity of handling the data-dependency between consecutive insertion cost computations @@ -1201,9 +1221,10 @@ SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_ice( // // "experimental" section. // // Another approach might be loop unrolling: - // current_vec.i32s[0] = last_in_row = sz_i32_max_of_two(current_vec.i32s[0], last_in_row + gap); - // current_vec.i32s[1] = last_in_row = sz_i32_max_of_two(current_vec.i32s[1], last_in_row + gap); - // current_vec.i32s[2] = last_in_row = sz_i32_max_of_two(current_vec.i32s[2], last_in_row + gap); + // current_vec.i32s[0] = last_in_row = sz_i32_max_of_two(current_vec.i32s[0], last_in_row + + // gap); current_vec.i32s[1] = last_in_row = sz_i32_max_of_two(current_vec.i32s[1], last_in_row + // + gap); current_vec.i32s[2] = last_in_row = sz_i32_max_of_two(current_vec.i32s[2], + // last_in_row + gap); // ... yet this approach is also quite expensive. for (int i = 0; i != 16; ++i) current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap); @@ -1345,7 +1366,7 @@ SZ_DYNAMIC sz_size_t sz_edit_distance( // sz_cptr_t b, sz_size_t b_length, // sz_size_t bound, sz_memory_allocator_t *alloc) { #if SZ_USE_ICE - return sz_edit_distance_avx512(a, a_length, b, b_length, bound, alloc); + return sz_edit_distance_ice(a, a_length, b, b_length, bound, alloc); #else return sz_edit_distance_serial(a, a_length, b, b_length, bound, alloc); #endif diff --git a/include/stringzilla/small_string.h b/include/stringzilla/small_string.h index ba823901..c5c70773 100644 --- a/include/stringzilla/small_string.h +++ b/include/stringzilla/small_string.h @@ -261,7 +261,7 @@ SZ_PUBLIC sz_ordering_t sz_string_order(sz_string_t const *a, sz_string_t const } SZ_PUBLIC void sz_string_init(sz_string_t *string) { - sz_assert(string && "String can't be SZ_NULL."); + _sz_assert(string && "String can't be SZ_NULL."); // Only 8 + 1 + 1 need to be initialized. string->internal.start = &string->internal.chars[0]; @@ -275,7 +275,7 @@ SZ_PUBLIC void sz_string_init(sz_string_t *string) { SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator) { sz_size_t space_needed = length + 1; // space for trailing \0 - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); + _sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); // Initialize the string to zeros for safety. 
string->words[1] = 0; string->words[2] = 0; @@ -292,14 +292,14 @@ SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, string->external.length = length; string->external.space = space_needed; } - sz_assert(&string->internal.start == &string->external.start && "Alignment confusion"); + _sz_assert(&string->internal.start == &string->external.start && "Alignment confusion"); string->external.start[length] = 0; return string->external.start; } SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator) { - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); + _sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); sz_size_t new_space = new_capacity + 1; if (new_space <= _SZ_STRING_INTERNAL_SPACE) return string->external.start; @@ -309,7 +309,7 @@ SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity sz_size_t string_space; sz_bool_t string_is_external; sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - sz_assert(new_space > string_space && "New space must be larger than current."); + _sz_assert(new_space > string_space && "New space must be larger than current."); sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); if (!new_start) return SZ_NULL_CHAR; @@ -327,7 +327,7 @@ SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator) { - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); + _sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); sz_ptr_t string_start; sz_size_t string_length; @@ -356,7 +356,7 @@ SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_alloca SZ_PUBLIC sz_ptr_t sz_string_expand( // sz_string_t *string, sz_size_t offset, sz_size_t added_length, sz_memory_allocator_t *allocator) { - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); + _sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); sz_ptr_t string_start; sz_size_t string_length; @@ -393,7 +393,7 @@ SZ_PUBLIC sz_ptr_t sz_string_expand( // SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length) { - sz_assert(string && "String can't be SZ_NULL."); + _sz_assert(string && "String can't be SZ_NULL."); sz_ptr_t string_start; sz_size_t string_length; diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index c0b1b369..ba36975c 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -41,6 +41,15 @@ #define STRINGZILLA_VERSION_MINOR 11 #define STRINGZILLA_VERSION_PATCH 0 +#include "compare.h" // `sz_equal`, `sz_order` +#include "find.h" // `sz_find`, `sz_find_charset`, `sz_rfind` +#include "hash.h" // `sz_checksum`, `sz_hash`, `sz_hashes` +#include "memory.h" // `sz_copy`, `sz_move`, `sz_fill` +#include "similarity.h" // `sz_edit_distance`, `sz_alignment_score` +#include "small_string.h" // `sz_string_t`, `sz_string_init`, `sz_string_free` +#include "sort.h" // `sz_sort`, `sz_sort_partial`, `sz_partition` +#include "types.h" // `sz_size_t`, `sz_bool_t`, `sz_ordering_t` + #ifdef __cplusplus extern "C" { #endif @@ -49,20 +58,18 @@ extern "C" { * @brief Enumeration of SIMD capabilities of the target architecture. 
* Used to introspect the supported functionality of the dynamic library. */ -typedef enum sz_capability_t { - sz_cap_serial_k = 1, /// Serial (non-SIMD) capability - sz_cap_any_k = 0x7FFFFFFF, /// Mask representing any capability +typedef enum { + sz_cap_serial_k = 1, ///< Serial (non-SIMD) capability + sz_cap_any_k = 0x7FFFFFFF, ///< Mask representing any capability with `INT_MAX` - sz_cap_arm_neon_k = 1 << 10, /// ARM NEON capability - sz_cap_arm_sve_k = 1 << 11, /// ARM SVE capability TODO: Not yet supported or used - sz_cap_arm_sve2_k = 1 << 12, - sz_cap_arm_sve2p1_k = 1 << 13, - sz_cap_x86_avx2_k = 1 << 20, /// x86 AVX2 capability - sz_cap_x86_avx512f_k = 1 << 21, /// x86 AVX512 F capability - sz_cap_x86_avx512bw_k = 1 << 22, /// x86 AVX512 BW instruction capability - sz_cap_x86_avx512vl_k = 1 << 23, /// x86 AVX512 VL instruction capability - sz_cap_x86_avx512vbmi_k = 1 << 24, /// x86 AVX512 VBMI instruction capability - sz_cap_x86_gfni_k = 1 << 25, /// x86 AVX512 GFNI instruction capability + sz_cap_haswell_k = 1 << 10, ///< x86 AVX2 capability with FMA and F16C extensions + sz_cap_skylake_k = 1 << 11, ///< x86 AVX512 baseline capability + sz_cap_ice_k = 1 << 12, ///< x86 AVX512 capability with advanced integer algos + + sz_cap_neon_k = 1 << 20, ///< ARM NEON baseline capability + sz_cap_sve_k = 1 << 21, ///< ARM SVE baseline capability + sz_cap_sve2_k = 1 << 22, ///< ARM SVE2 capability + sz_cap_sve2p1_k = 1 << 23, ///< ARM SVE2p1 capability } sz_capability_t; @@ -72,6425 +79,7 @@ typedef enum sz_capability_t { */ SZ_DYNAMIC sz_capability_t sz_capabilities(void); -/** - * @brief Checks if two string are equal. - * Similar to `memcmp(a, b, length) == 0` in LibC and `a == b` in STL. - * - * The implementation of this function is very similar to `sz_order`, but the usage patterns are different. - * This function is more often used in parsing, while `sz_order` is often used in sorting. - * It works best on platforms with cheap - * - * @param a First string to compare. - * @param b Second string to compare. - * @param length Number of bytes in both strings. - * @return 1 if strings match, 0 otherwise. - */ -SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length); - -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length); - -/** - * @brief Estimates the relative order of two strings. Equivalent to `memcmp(a, b, length)` in LibC. - * Can be used on different length strings. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * @return Negative if (a < b), positive if (a > b), zero if they are equal. - */ -SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); - -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); - -/** - * @brief Initializes a string class instance to an empty value. - */ -SZ_PUBLIC void sz_string_init(sz_string_t *string); - -/** - * @brief Convenience function checking if the provided string is stored inside of the ::string instance itself, - * alternative being - allocated in a remote region of the heap. - */ -SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string); - -/** - * @brief Unpacks the opaque instance of a string class into its components. - * Recommended to use only in read-only operations. 
- * - * @param string String to unpack. - * @param start Pointer to the start of the string. - * @param length Number of bytes in the string, before the SZ_NULL character. - * @param space Number of bytes allocated for the string (heap or stack), including the SZ_NULL character. - * @param is_external Whether the string is allocated on the heap externally, or fits withing ::string instance. - */ -SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space, - sz_bool_t *is_external); - -/** - * @brief Unpacks only the start and length of the string. - * Recommended to use only in read-only operations. - * - * @param string String to unpack. - * @param start Pointer to the start of the string. - * @param length Number of bytes in the string, before the SZ_NULL character. - */ -SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length); - -/** - * @brief Constructs a string of a given ::length with noisy contents. - * Use the returned character pointer to populate the string. - * - * @param string String to initialize. - * @param length Number of bytes in the string, before the SZ_NULL character. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator); - -/** - * @brief Doesn't change the contents or the length of the string, but grows the available memory capacity. - * This is beneficial, if several insertions are expected, and we want to minimize allocations. - * - * @param string String to grow. - * @param new_capacity The number of characters to reserve space for, including existing ones. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the new start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator); - -/** - * @brief Grows the string by adding an uninitialized region of ::added_length at the given ::offset. - * Would often be used in conjunction with one or more `sz_copy` calls to populate the allocated region. - * Similar to `sz_string_reserve`, but changes the length of the ::string. - * - * @param string String to grow. - * @param offset Offset of the first byte to reserve space for. - * If provided offset is larger than the length, it will be capped. - * @param added_length The number of new characters to reserve space for. - * @param allocator Memory allocator to use for the allocation. - * @return SZ_NULL if the operation failed, pointer to the new start of the string otherwise. - */ -SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length, - sz_memory_allocator_t *allocator); - -/** - * @brief Removes a range from a string. Changes the length, but not the capacity. - * Performs no allocations or deallocations and can't fail. - * - * @param string String to clean. - * @param offset Offset of the first byte to remove. - * @param length Number of bytes to remove. Out-of-bound ranges will be capped. - * @return Number of bytes removed. - */ -SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length); - -/** - * @brief Shrinks the string to fit the current length, if it's allocated on the heap. 
- * It's the reverse operation of ::sz_string_reserve. - * - * @param string String to shrink. - * @param allocator Memory allocator to use for the allocation. - * @return Whether the operation was successful. The only failures can come from the allocator. - * On failure, the string will remain unchanged. - */ -SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator); - -/** - * @brief Frees the string, if it's allocated on the heap. - * If the string is on the stack, the function clears/resets the state. - */ -SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator); - -#pragma endregion - -#pragma region Fast Substring Search API - -typedef sz_cptr_t (*sz_find_byte_t)(sz_cptr_t, sz_size_t, sz_cptr_t); -typedef sz_cptr_t (*sz_find_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_charset_t const *); - -/** - * @brief Locates first matching byte in a string. Equivalent to `memchr(haystack, *needle, h_length)` in LibC. - * - * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memchr.S - * Aarch64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/aarch64/memchr.S - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the first match. - */ -SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** - * @brief Locates last matching byte in a string. Equivalent to `memrchr(haystack, *needle, h_length)` in LibC. - * - * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memrchr.S - * Aarch64 implementation: missing - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the last match. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); - -/** - * @brief Locates first matching substring. - * Equivalent to `memmem(haystack, h_length, needle, n_length)` in LibC. - * Similar to `strstr(haystack, needle)` in LibC, but requires known length. - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the first match. - */ -SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** - * @brief Locates the last matching substring. - * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the last match. 
- */ -SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); - -/** - * @brief Finds the first character present from the ::set, present in ::text. - * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_rfind_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". - * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the first matching character from ::set. - */ -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** - * @brief Finds the last character present from the ::set, present in ::text. - * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_find_charset. - * - * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". - * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. - * * 2 JSON string special characters useful to locate the end of the string: "\"\\". - * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the last matching character from ::set. - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); - -#pragma endregion - -#pragma region String Similarity Measures API - -/** - * @brief Computes the Hamming distance between two strings - number of not matching characters. - * Difference in length is is counted as a mismatch. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for the distance, the `bound` if was exceeded. - * - * @see sz_hamming_distance_utf8 - * @see https://en.wikipedia.org/wiki/Hamming_distance - */ -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); - -/** @copydoc sz_hamming_distance */ -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); - -/** - * @brief Computes the Hamming distance between two @b UTF8 strings - number of not matching characters. - * Difference in length is is counted as a mismatch. - * - * @param a First string to compare. 
- * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for the distance, the `bound` if was exceeded. - * - * @see sz_hamming_distance - * @see https://en.wikipedia.org/wiki/Hamming_distance - */ -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); - -/** @copydoc sz_hamming_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_size_t bound); - -typedef sz_size_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t); - -/** - * @brief Computes the Levenshtein edit-distance between two strings using the Wagner-Fisher algorithm. - * Similar to the Needleman-Wunsch alignment algorithm. Often used in fuzzy string matching. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Exclusive upper bound on the distance, that allows us to exit early. - * Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore. - * Pass zero to check if the strings are equal. - * @return Unsigned integer for the edit distance. Zero means the strings are equal. - * Returns the `bound` if it was exceeded or `SZ_SIZE_MAX` if the memory allocation failed. - * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default - * @see https://en.wikipedia.org/wiki/Levenshtein_distance - */ -SZ_DYNAMIC sz_size_t sz_edit_distance(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** @copydoc sz_edit_distance */ -SZ_PUBLIC sz_size_t sz_edit_distance_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); - -/** - * @brief Computes the Levenshtein edit-distance between two @b UTF8 strings. - * Unlike `sz_edit_distance`, reports the distance in Unicode codepoints, and not in bytes. - * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. - * @param bound Upper bound on the distance, that allows us to exit early. - * If zero is passed, the maximum possible distance will be equal to the length of the longer input. - * @return Unsigned integer for edit distance, the `bound` if was exceeded or `SZ_SIZE_MAX` - * if the memory allocation failed. 
- *
- * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default, sz_edit_distance
- * @see https://en.wikipedia.org/wiki/Levenshtein_distance
- */
-SZ_DYNAMIC sz_size_t sz_edit_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
-    sz_size_t bound, sz_memory_allocator_t *alloc);
-
-typedef sz_size_t (*sz_edit_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_memory_allocator_t *);
-
-/** @copydoc sz_edit_distance_utf8 */
-SZ_PUBLIC sz_size_t sz_edit_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
-    sz_size_t bound, sz_memory_allocator_t *alloc);
-
-/**
- * @brief Computes the Needleman–Wunsch alignment score for two strings. Often used in bioinformatics and cheminformatics.
- * Similar to the Levenshtein edit-distance, parameterized for gap and substitution penalties.
- *
- * Not commutative in the general case, as the order of the strings matters: `sz_alignment_score(a, b)` may
- * not be equal to `sz_alignment_score(b, a)`. Becomes @b commutative if the substitution costs are symmetric.
- * Equivalent to the negative Levenshtein distance if `gap == -1` and `subs[i][j] == (i == j ? 0: -1)`.
- *
- * @param a First string to compare.
- * @param a_length Number of bytes in the first string.
- * @param b Second string to compare.
- * @param b_length Number of bytes in the second string.
- * @param gap Penalty cost for gaps - insertions and removals.
- * @param subs Substitution costs matrix with 256 x 256 values for all pairs of characters.
- *
- * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated,
- * so the memory usage is linear in relation to ::a_length and ::b_length.
- * If SZ_NULL is passed, will initialize to the system's default `malloc`.
- * @return Signed similarity score. Can be negative, depending on the substitution costs.
- * If the memory allocation fails, the function returns `SZ_SSIZE_MAX`.
- *
- * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default
- * @see https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
- */
-SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
-    sz_error_cost_t const *subs, sz_error_cost_t gap, //
-    sz_memory_allocator_t *alloc);
-
-/** @copydoc sz_alignment_score */
-SZ_PUBLIC sz_ssize_t sz_alignment_score_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
-    sz_error_cost_t const *subs, sz_error_cost_t gap, //
-    sz_memory_allocator_t *alloc);
-
-typedef sz_ssize_t (*sz_alignment_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *,
-    sz_error_cost_t, sz_memory_allocator_t *);
-
-typedef void (*sz_hash_callback_t)(sz_cptr_t, sz_size_t, sz_u64_t, void *user);
-
-/**
- * @brief Computes the Karp-Rabin rolling hashes of a string, supplying them to the provided `callback`.
- * Can be used for similarity scores, search, ranking, etc.
- *
- * Rabin-Karp-like rolling hashes can have a very high level of collisions and depend
- * on the choice of bases and the prime number. That's why often two hashes from the same
- * family are used with different bases.
- *
- * 1. Kernighan and Ritchie's function uses 31, a prime close to the size of the English alphabet.
- * 2. To be friendlier to byte-arrays and UTF8, we use 257 for the second function.
- *
- * Choosing the right ::window_length is task- and domain-dependent.
For example, most English words are - * between 3 and 7 characters long, so a window of 4 bytes would be a good choice. For DNA sequences, - * the ::window_length might be a multiple of 3, as the codons are 3 (nucleotides) bytes long. - * With such minimalistic alphabets of just four characters (AGCT) longer windows might be needed. - * For protein sequences the alphabet is 20 characters long, so the window can be shorter, than for DNAs. - * - * @param text String to hash. - * @param length Number of bytes in the string. - * @param window_length Length of the rolling window in bytes. - * @param window_step Step of reported hashes. @b Must be power of two. Should be smaller than `window_length`. - * @param callback Function receiving the start & length of a substring, the hash, and the `callback_handle`. - * @param callback_handle Optional user-provided pointer to be passed to the `callback`. - * @see sz_hashes_fingerprint, sz_hashes_intersection - */ -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); - -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); - -typedef void (*sz_hashes_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_size_t, sz_hash_callback_t, void *); - -/** - * @brief Computes the Karp-Rabin rolling hashes of a string outputting a binary fingerprint. - * Such fingerprints can be compared with Hamming or Jaccard (Tanimoto) distance for similarity. - * - * The algorithm doesn't clear the fingerprint buffer on start, so it can be invoked multiple times - * to produce a fingerprint of a longer string, by passing the previous fingerprint as the ::fingerprint. - * It can also be reused to produce multi-resolution fingerprints by changing the ::window_length - * and calling the same function multiple times for the same input ::text. - * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text String to hash. - * @param length Number of bytes in the string. - * @param fingerprint Output fingerprint buffer. - * @param fingerprint_bytes Number of bytes in the fingerprint buffer. - * @param window_length Length of the rolling window in bytes. - * @see sz_hashes, sz_hashes_intersection - */ -SZ_PUBLIC void sz_hashes_fingerprint( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_ptr_t fingerprint, sz_size_t fingerprint_bytes); - -typedef void (*sz_hashes_fingerprint_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_ptr_t, sz_size_t); - -/** - * @brief Given a hash-fingerprint of a textual document, computes the number of intersecting hashes - * of the incoming document. Can be used for document scoring and search. - * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text Input document. - * @param length Number of bytes in the input document. - * @param fingerprint Reference document fingerprint. - * @param fingerprint_bytes Number of bytes in the reference documents fingerprint. - * @param window_length Length of the rolling window in bytes. 
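As a rough sketch of how the rolling-hash callback machinery described above might be driven: the sample sentence, the 4-byte window, the unit step, and the `count_even_hashes` helper are illustrative choices of this sketch, not something the header prescribes.

#include <stdio.h>
#include <stringzilla/stringzilla.h>

// Matches `sz_hash_callback_t`: receives the window start, its length, the rolling hash,
// and the user-provided handle - here a simple counter.
static void count_even_hashes(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) {
    (void)start, (void)length;
    sz_size_t *counter = (sz_size_t *)handle;
    *counter += (hash & 1) == 0;
}

int main(void) {
    char const *text = "the quick brown fox jumps over the lazy dog";
    sz_size_t count = 0;
    // Hash every 4-byte window; a step of 1 is a power of two and smaller than the window.
    sz_hashes(text, 43, 4, 1, &count_even_hashes, &count);
    printf("windows with an even hash: %zu\n", (size_t)count);
    return 0;
}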
- * @see sz_hashes, sz_hashes_fingerprint - */ -SZ_PUBLIC sz_size_t sz_hashes_intersection( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_cptr_t fingerprint, sz_size_t fingerprint_bytes); - -typedef sz_size_t (*sz_hashes_intersection_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_cptr_t, sz_size_t); - -#pragma endregion - -#pragma region Convenience API - -/** - * @brief Finds the first character in the haystack, that is present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the first character in the haystack, that is @b not present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the last character in the haystack, that is present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -/** - * @brief Finds the last character in the haystack, that is @b not present in the needle. - * Convenience function, reused across different language bindings. - * @see sz_find_charset - */ -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length); - -#pragma endregion - -#pragma region String Sequences API - -struct sz_sequence_t; - -typedef sz_cptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t); -typedef sz_bool_t (*sz_string_is_less_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); - -typedef struct sz_sequence_t { - sz_sorted_idx_t *order; - sz_size_t count; - sz_sequence_member_start_t get_start; - sz_sequence_member_length_t get_length; - void const *handle; -} sz_sequence_t; - -/** - * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. - * Expects ::offsets to contains `count + 1` entries, the last pointing at the end - * of the last string, indicating the total length of the ::tape. - */ -SZ_PUBLIC void sz_sequence_from_u32tape(sz_cptr_t *start, sz_u32_t const *offsets, sz_size_t count, - sz_sequence_t *sequence); - -/** - * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. - * Expects ::offsets to contains `count + 1` entries, the last pointing at the end - * of the last string, indicating the total length of the ::tape. - */ -SZ_PUBLIC void sz_sequence_from_u64tape(sz_cptr_t *start, sz_u64_t const *offsets, sz_size_t count, - sz_sequence_t *sequence); - -/** - * @brief Similar to `std::partition`, given a predicate splits the sequence into two parts. - * The algorithm is unstable, meaning that elements may change relative order, as long - * as they are in the right partition. This is the simpler algorithm for partitioning. 
- */ -SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate); - -/** - * @brief Inplace `std::set_union` for two consecutive chunks forming the same continuous `sequence`. - * - * @param partition The number of elements in the first sub-sequence in `sequence`. - * @param less Comparison function, to determine the lexicographic ordering. - */ -SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less); - -/** - * @brief Sorting algorithm, combining Radix Sort for the first 32 bits of every word - * and a follow-up by a more conventional sorting procedure on equally prefixed parts. - */ -SZ_PUBLIC void sz_sort(sz_sequence_t *sequence); - -/** - * @brief Partial sorting algorithm, combining Radix Sort for the first 32 bits of every word - * and a follow-up by a more conventional sorting procedure on equally prefixed parts. - */ -SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t n); - -/** - * @brief Intro-Sort algorithm that supports custom comparators. - */ -SZ_PUBLIC void sz_sort_intro(sz_sequence_t *sequence, sz_sequence_comparator_t less); - -#pragma endregion - -/* - * Hardware feature detection. - * All of those can be controlled by the user. - */ -#ifndef SZ_USE_ICE -#ifdef __AVX512BW__ -#define SZ_USE_ICE 1 -#else -#define SZ_USE_ICE 0 -#endif -#endif - -#ifndef SZ_USE_HASWELL -#ifdef __AVX2__ -#define SZ_USE_HASWELL 1 -#else -#define SZ_USE_HASWELL 0 -#endif -#endif - -#ifndef SZ_USE_NEON -#ifdef __ARM_NEON -#define SZ_USE_NEON 1 -#else -#define SZ_USE_NEON 0 -#endif -#endif - -#ifndef SZ_USE_SVE -#ifdef __ARM_FEATURE_SVE -#define SZ_USE_SVE 1 -#else -#define SZ_USE_SVE 0 -#endif -#endif - -/* - * Include hardware-specific headers. - */ -#if SZ_USE_ICE || SZ_USE_HASWELL -#include -#endif // SZ_USE_X86... 
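To illustrate how the sequence interface above is meant to be wired up, here is a hedged sketch that sorts a small array of NUL-terminated strings. The accessor helpers, the sample data, the include path, and the choice to pre-fill `order` with the identity permutation are assumptions of this sketch rather than requirements spelled out by the header at this point.

#include <stdio.h>
#include <string.h>
#include <stringzilla/stringzilla.h>

// Expose a plain `char const *` array through the `sz_sequence_t` accessor callbacks.
static sz_cptr_t get_start(sz_sequence_t const *sequence, sz_size_t i) {
    return ((char const *const *)sequence->handle)[i];
}
static sz_size_t get_length(sz_sequence_t const *sequence, sz_size_t i) {
    return strlen(((char const *const *)sequence->handle)[i]);
}

int main(void) {
    char const *strings[] = {"banana", "apple", "cherry", "apricot"};
    sz_sorted_idx_t order[4];
    for (sz_size_t i = 0; i != 4; ++i) order[i] = (sz_sorted_idx_t)i; // start from the identity permutation

    sz_sequence_t sequence;
    sequence.order = order;
    sequence.count = 4;
    sequence.get_start = get_start;
    sequence.get_length = get_length;
    sequence.handle = strings;

    sz_sort(&sequence); // reorders `order`, so it enumerates the strings lexicographically
    for (sz_size_t i = 0; i != 4; ++i) printf("%s\n", strings[order[i]]);
    return 0;
}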
-#if SZ_USE_NEON -#if !defined(_MSC_VER) -#include -#endif -#include -#endif // SZ_USE_NEON -#if SZ_USE_SVE -#if !defined(_MSC_VER) -#include -#endif -#endif // SZ_USE_SVE - -#pragma region Hardware Specific API - -#if SZ_USE_ICE - -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_skylake(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_ice(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_skylake(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_ice(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_edit_distance */ -SZ_PUBLIC sz_size_t sz_edit_distance_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); -/** @copydoc sz_alignment_score */ -SZ_PUBLIC sz_ssize_t sz_alignment_score_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle); -#endif - -#if SZ_USE_HASWELL -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc 
sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle); -#endif - -#if SZ_USE_NEON -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -#endif - -#if SZ_USE_SVE -/** @copydoc sz_equal */ -SZ_PUBLIC sz_bool_t sz_equal_sve(sz_cptr_t a, sz_cptr_t b, sz_size_t length); -/** @copydoc sz_order */ -SZ_PUBLIC sz_ordering_t sz_order_sve(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); -/** @copydoc sz_copy */ -SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_move */ -SZ_PUBLIC void sz_move_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length); -/** @copydoc sz_fill */ -SZ_PUBLIC void sz_fill_sve(sz_ptr_t target, sz_size_t length, sz_u8_t value); -/** @copydoc sz_find_byte */ -SZ_PUBLIC sz_cptr_t sz_find_byte_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_rfind_byte */ -SZ_PUBLIC sz_cptr_t sz_rfind_byte_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); -/** @copydoc sz_find */ -SZ_PUBLIC sz_cptr_t sz_find_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_rfind */ -SZ_PUBLIC sz_cptr_t sz_rfind_sve(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_sve(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_sve(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -#endif - -#pragma endregion - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wconversion" - -/* - ********************************************************************************************************************** - 
********************************************************************************************************************** - ********************************************************************************************************************** - * - * This is where we the actual implementation begins. - * The rest of the file is hidden from the public API. - * - ********************************************************************************************************************** - ********************************************************************************************************************** - ********************************************************************************************************************** - */ - -#pragma region Compiler Extensions and Helper Functions - -#pragma GCC visibility push(hidden) - -/** - * @brief Helper-macro to mark potentially unused variables. - */ -#define sz_unused(x) ((void)(x)) - -/** - * @brief Helper-macro casting a variable to another type of the same size. - */ -#define sz_bitcast(type, value) (*((type *)&(value))) - -/** - * @brief Defines `SZ_NULL`, analogous to `NULL`. - * The default often comes from locale.h, stddef.h, - * stdio.h, stdlib.h, string.h, time.h, or wchar.h. - */ -#ifdef __GNUG__ -#define SZ_NULL __null -#define SZ_NULL_CHAR __null -#else -#define SZ_NULL ((void *)0) -#define SZ_NULL_CHAR ((char *)0) -#endif - -/** - * @brief Cache-line width, that will affect the execution of some algorithms, - * like equality checks and relative order computing. - */ -#define SZ_CACHE_LINE_WIDTH (64) // bytes - -/** - * @brief Similar to `assert`, the `sz_assert` is used in the SZ_DEBUG mode - * to check the invariants of the library. It's a no-op in the SZ_RELEASE mode. - * @note If you want to catch it, put a breakpoint at @b `__GI_exit` - */ -#if SZ_DEBUG && defined(SZ_AVOID_LIBC) && !SZ_AVOID_LIBC && !defined(SZ_PIC) -#include // `fprintf` -#include // `EXIT_FAILURE` -SZ_PUBLIC void _sz_assert_failure(char const *condition, char const *file, int line) { - fprintf(stderr, "Assertion failed: %s, in file %s, line %d\n", condition, file, line); - exit(EXIT_FAILURE); -} -#define sz_assert(condition) \ - do { \ - if (!(condition)) { _sz_assert_failure(#condition, __FILE__, __LINE__); } \ - } while (0) -#else -#define sz_assert(condition) ((void)(condition)) -#endif - -/* Intrinsics aliases for MSVC, GCC, Clang, and Clang-Cl. - * The following section of compiler intrinsics comes in 2 flavors. - */ -#if defined(_MSC_VER) && !defined(__clang__) // On Clang-CL -#include - -// Sadly, when building Win32 images, we can't use the `_tzcnt_u64`, `_lzcnt_u64`, -// `_BitScanForward64`, or `_BitScanReverse64` intrinsics. For now it's a simple `for`-loop. -// TODO: In the future we can switch to a more efficient De Bruijn's algorithm. -// https://www.chessprogramming.org/BitScan -// https://www.chessprogramming.org/De_Bruijn_Sequence -// https://gist.github.com/resilar/e722d4600dbec9752771ab4c9d47044f -// -// Use the serial version on 32-bit x86 and on Arm. 
-#if (defined(_WIN32) && !defined(_WIN64)) || defined(_M_ARM) || defined(_M_ARM64) -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 1) == 0) { n++, x >>= 1; } - return n; -} -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 0x8000000000000000ull) == 0) { n++, x <<= 1; } - return n; -} -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { - x = x - ((x >> 1) & 0x5555555555555555ull); - x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull); - return (((x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Full) * 0x0101010101010101ull) >> 56; -} -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 1) == 0) { n++, x >>= 1; } - return n; -} -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { - sz_assert(x != 0); - int n = 0; - while ((x & 0x80000000u) == 0) { n++, x <<= 1; } - return n; -} -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { - x = x - ((x >> 1) & 0x55555555); - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - return (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; -} -#else -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return (int)_tzcnt_u64(x); } -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return (int)_lzcnt_u64(x); } -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return (int)__popcnt64(x); } -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return (int)_tzcnt_u32(x); } -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return (int)_lzcnt_u32(x); } -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return (int)__popcnt(x); } -#endif -// Force the byteswap functions to be intrinsics, because when /Oi- is given, these will turn into CRT function calls, -// which breaks when `SZ_AVOID_LIBC` is given -#pragma intrinsic(_byteswap_uint64) -SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return _byteswap_uint64(val); } -#pragma intrinsic(_byteswap_ulong) -SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return _byteswap_ulong(val); } -#else -SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return __builtin_popcountll(x); } -SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return __builtin_popcount(x); } -SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return __builtin_ctzll(x); } -SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return __builtin_clzll(x); } -SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return __builtin_ctz(x); } // ! Undefined if `x == 0` -SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return __builtin_clz(x); } // ! Undefined if `x == 0` -SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return __builtin_bswap64(val); } -SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return __builtin_bswap32(val); } -#endif - -SZ_INTERNAL sz_u64_t sz_u64_rotl(sz_u64_t x, sz_u64_t r) { return (x << r) | (x >> (64 - r)); } - -/** - * @brief Select bits from either ::a or ::b depending on the value of ::mask bits. - * - * Similar to `_mm_blend_epi16` intrinsic on x86. - * Described in the "Bit Twiddling Hacks" by Sean Eron Anderson. - * https://graphics.stanford.edu/~seander/bithacks.html#ConditionalSetOrClearBitsWithoutBranching - */ -SZ_INTERNAL sz_u64_t sz_u64_blend(sz_u64_t a, sz_u64_t b, sz_u64_t mask) { return a ^ ((a ^ b) & mask); } - -/* - * Efficiently computing the minimum and maximum of two or three values can be tricky. - * The simple branching baseline would be: - * - * x < y ? x : y // can replace with 1 conditional move - * - * Branchless approach is well known for signed integers, but it doesn't apply to unsigned ones. 
- * https://stackoverflow.com/questions/514435/templatized-branchless-int-max-min-function - * https://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax - * Using only bit-shifts for singed integers it would be: - * - * y + ((x - y) & (x - y) >> 31) // 4 unique operations - * - * Alternatively, for any integers using multiplication: - * - * (x > y) * y + (x <= y) * x // 5 operations - * - * Alternatively, to avoid multiplication: - * - * x & ~((x < y) - 1) + y & ((x < y) - 1) // 6 unique operations - */ -#define sz_min_of_two(x, y) (x < y ? x : y) -#define sz_max_of_two(x, y) (x < y ? y : x) -#define sz_min_of_three(x, y, z) sz_min_of_two(x, sz_min_of_two(y, z)) -#define sz_max_of_three(x, y, z) sz_max_of_two(x, sz_max_of_two(y, z)) - -/** @brief Branchless minimum function for two signed 32-bit integers. */ -SZ_INTERNAL sz_i32_t sz_i32_min_of_two(sz_i32_t x, sz_i32_t y) { return y + ((x - y) & (x - y) >> 31); } - -/** @brief Branchless minimum function for two signed 32-bit integers. */ -SZ_INTERNAL sz_i32_t sz_i32_max_of_two(sz_i32_t x, sz_i32_t y) { return x - ((x - y) & (x - y) >> 31); } - -/** - * @brief Clamps signed offsets in a string to a valid range. Used for Pythonic-style slicing. - */ -SZ_INTERNAL void sz_ssize_clamp_interval(sz_size_t length, sz_ssize_t start, sz_ssize_t end, - sz_size_t *normalized_offset, sz_size_t *normalized_length) { - // TODO: Remove branches. - // Normalize negative indices - if (start < 0) start += length; - if (end < 0) end += length; - - // Clamp indices to a valid range - if (start < 0) start = 0; - if (end < 0) end = 0; - if (start > (sz_ssize_t)length) start = length; - if (end > (sz_ssize_t)length) end = length; - - // Ensure start <= end - if (start > end) start = end; - - *normalized_offset = start; - *normalized_length = end - start; -} - -/** - * @brief Compute the logarithm base 2 of a positive integer, rounding down. - */ -SZ_INTERNAL sz_size_t sz_size_log2i_nonzero(sz_size_t x) { - sz_assert(x > 0 && "Non-positive numbers have no defined logarithm"); - sz_size_t leading_zeros = sz_u64_clz(x); - return 63 - leading_zeros; -} - -/** - * @brief Compute the smallest power of two greater than or equal to ::x. - */ -SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) { - // Unlike the commonly used trick with `clz` intrinsics, is valid across the whole range of `x`. - // https://stackoverflow.com/a/10143264 - x--; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; -#if _SZ_IS_64_BIT - x |= x >> 32; -#endif - x++; - return x; -} - -/** - * @brief Transposes an 8x8 bit matrix packed in a `sz_u64_t`. - * - * There is a well known SWAR sequence for that known to chess programmers, - * willing to flip a bit-matrix of pieces along the main A1-H8 diagonal. - * https://www.chessprogramming.org/Flipping_Mirroring_and_Rotating - * https://lukas-prokop.at/articles/2021-07-23-transpose - */ -SZ_INTERNAL sz_u64_t sz_u64_transpose(sz_u64_t x) { - sz_u64_t t; - t = x ^ (x << 36); - x ^= 0xf0f0f0f00f0f0f0full & (t ^ (x >> 36)); - t = 0xcccc0000cccc0000ull & (x ^ (x << 18)); - x ^= t ^ (t >> 18); - t = 0xaa00aa00aa00aa00ull & (x ^ (x << 9)); - x ^= t ^ (t >> 9); - return x; -} - -/** - * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. - */ -SZ_INTERNAL void sz_u64_swap(sz_u64_t *a, sz_u64_t *b) { - sz_u64_t t = *a; - *a = *b; - *b = t; -} - -/** - * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. 
- */ -SZ_INTERNAL void sz_pointer_swap(void **a, void **b) { - void *t = *a; - *a = *b; - *b = t; -} - -/** - * @brief Helper structure to simplify work with 16-bit words. - * @see sz_u16_load - */ -typedef union sz_u16_vec_t { - sz_u16_t u16; - sz_u8_t u8s[2]; -} sz_u16_vec_t; - -/** - * @brief Load a 16-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u16_vec_t sz_u16_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u16_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u16_vec_t *)ptr); -#else - return *((__unaligned sz_u16_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u16_vec_t const *result = (sz_u16_vec_t const *)ptr; - return *result; -#endif -} - -/** - * @brief Helper structure to simplify work with 32-bit words. - * @see sz_u32_load - */ -typedef union sz_u32_vec_t { - sz_u32_t u32; - sz_u16_t u16s[2]; - sz_u8_t u8s[4]; -} sz_u32_vec_t; - -/** - * @brief Load a 32-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u32_vec_t sz_u32_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u32_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - result.u8s[2] = ptr[2]; - result.u8s[3] = ptr[3]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u32_vec_t *)ptr); -#else - return *((__unaligned sz_u32_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u32_vec_t const *result = (sz_u32_vec_t const *)ptr; - return *result; -#endif -} - -/** - * @brief Helper structure to simplify work with 64-bit words. - * @see sz_u64_load - */ -typedef union sz_u64_vec_t { - sz_u64_t u64; - sz_u32_t u32s[2]; - sz_u16_t u16s[4]; - sz_u8_t u8s[8]; -} sz_u64_vec_t; - -/** - * @brief Load a 64-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ -SZ_INTERNAL sz_u64_vec_t sz_u64_load(sz_cptr_t ptr) { -#if !SZ_USE_MISALIGNED_LOADS - sz_u64_vec_t result; - result.u8s[0] = ptr[0]; - result.u8s[1] = ptr[1]; - result.u8s[2] = ptr[2]; - result.u8s[3] = ptr[3]; - result.u8s[4] = ptr[4]; - result.u8s[5] = ptr[5]; - result.u8s[6] = ptr[6]; - result.u8s[7] = ptr[7]; - return result; -#elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. - return *((sz_u64_vec_t *)ptr); -#else - return *((__unaligned sz_u64_vec_t *)ptr); -#endif -#else - __attribute__((aligned(1))) sz_u64_vec_t const *result = (sz_u64_vec_t const *)ptr; - return *result; -#endif -} - -/** @brief Helper function, using the supplied fixed-capacity buffer to allocate memory. */ -SZ_INTERNAL sz_ptr_t _sz_memory_allocate_fixed(sz_size_t length, void *handle) { - sz_size_t capacity; - sz_copy((sz_ptr_t)&capacity, (sz_cptr_t)handle, sizeof(sz_size_t)); - sz_size_t consumed_capacity = sizeof(sz_size_t); - if (consumed_capacity + length > capacity) return SZ_NULL_CHAR; - return (sz_ptr_t)handle + consumed_capacity; -} - -/** @brief Helper "no-op" function, simulating memory deallocation when we use a "static" memory buffer. 
*/ -SZ_INTERNAL void _sz_memory_free_fixed(sz_ptr_t start, sz_size_t length, void *handle) { - sz_unused(start && length && handle); -} - -/** @brief An internal callback used to set a bit in a power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) & (fingerprint_bytes - 1)] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback used to set a bit in a @b non power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_non_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) % fingerprint_bytes] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback, used to mix all the running hashes into one pointer-size value. */ -SZ_INTERNAL void _sz_hashes_fingerprint_scalar_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *scalar_handle) { - sz_unused(start && length && hash && scalar_handle); - sz_size_t *scalar_ptr = (sz_size_t *)scalar_handle; - *scalar_ptr ^= hash; -} - -#pragma GCC visibility pop -#pragma endregion - -#pragma region Serial Implementation - -#if !SZ_AVOID_LIBC -#include // `fprintf` -#include // `malloc`, `EXIT_FAILURE` - -SZ_PUBLIC void *_sz_memory_allocate_default(sz_size_t length, void *handle) { - sz_unused(handle); - return malloc(length); -} -SZ_PUBLIC void _sz_memory_free_default(sz_ptr_t start, sz_size_t length, void *handle) { - sz_unused(handle && length); - free(start); -} - -#endif - -SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc) { -#if !SZ_AVOID_LIBC - alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_default; - alloc->free = (sz_memory_free_t)_sz_memory_free_default; -#else - alloc->allocate = (sz_memory_allocate_t)SZ_NULL; - alloc->free = (sz_memory_free_t)SZ_NULL; -#endif - alloc->handle = SZ_NULL; -} - -SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length) { - // The logic here is simple - put the buffer length in the first slots of the buffer. - // Later use it for bounds checking. 
- alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_fixed; - alloc->free = (sz_memory_free_t)_sz_memory_free_fixed; - alloc->handle = &buffer; - sz_copy((sz_ptr_t)buffer, (sz_cptr_t)&length, sizeof(sz_size_t)); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { - for (sz_cptr_t const end = text + length; text != end; ++text) - if (sz_charset_contains(set, *text)) return text; - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Warray-bounds" - sz_cptr_t const end = text; - for (text += length; text != end;) - if (sz_charset_contains(set, *(text -= 1))) return text; - return SZ_NULL_CHAR; -#pragma GCC diagnostic pop -} - -/** - * One option to avoid branching is to use conditional moves and lookup the comparison result in a table: - * sz_ordering_t ordering_lookup[2] = {sz_greater_k, sz_less_k}; - * for (; a != min_end; ++a, ++b) - * if (*a != *b) return ordering_lookup[*a < *b]; - * That, however, introduces a data-dependency. - * A cleaner option is to perform two comparisons and a subtraction. - * One instruction more, but no data-dependency. - */ -#define _sz_order_scalars(a, b) ((sz_ordering_t)((a > b) - (a < b))) - -SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - sz_bool_t a_shorter = (sz_bool_t)(a_length < b_length); - sz_size_t min_length = a_shorter ? a_length : b_length; - sz_cptr_t min_end = a + min_length; -#if SZ_USE_MISALIGNED_LOADS && !_SZ_IS_BIG_ENDIAN - for (sz_u64_vec_t a_vec, b_vec; a + 8 <= min_end; a += 8, b += 8) { - a_vec = sz_u64_load(a); - b_vec = sz_u64_load(b); - if (a_vec.u64 != b_vec.u64) - return _sz_order_scalars(sz_u64_bytes_reverse(a_vec.u64), sz_u64_bytes_reverse(b_vec.u64)); - } -#endif - for (; a != min_end; ++a, ++b) - if (*a != *b) return _sz_order_scalars(*a, *b); - - // If the strings are equal up to `min_end`, then the shorter string is smaller - return _sz_order_scalars(a_length, b_length); -} - -/** - * @brief Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each byte is set. - // For that take the bottom 7 bits of each byte, add one to them, - // and if this sets the top bit to one, then all the 7 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) & ((vec.u64 & 0x8080808080808080ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memchr(haystack, needle[0], haystack_length)`. - */ -SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_end = h + h_length; - -#if !_SZ_IS_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevity. -#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. 
- for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. - sz_u64_vec_t h_vec, n_vec, match_vec; - match_vec.u64 = 0; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h + 8 <= h_end; h += 8) { - h_vec.u64 = *(sz_u64_t const *)h; - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h + sz_u64_ctz(match_vec.u64) / 8; - } -#endif - - // Handle the misaligned tail. - for (; h < h_end; ++h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief Find the last occurrence of a @b single-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - * Identical to `memrchr(haystack, needle[0], haystack_length)`. - */ -sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - if (!h_length) return SZ_NULL_CHAR; - sz_cptr_t const h_start = h; - - // Reposition the `h` pointer to the end, as we will be walking backwards. - h = h + h_length - 1; - -#if !_SZ_IS_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevity. -#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)(h + 1) & 7ull) && h >= h_start; --h) - if (*h == *n) return h; -#endif - - // Broadcast the n into every byte of a 64-bit integer to use SWAR - // techniques and process eight characters at a time. - sz_u64_vec_t h_vec, n_vec, match_vec; - n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull; - for (; h >= h_start + 7; h -= 8) { - h_vec.u64 = *(sz_u64_t const *)(h - 7); - match_vec = _sz_u64_each_byte_equal(h_vec, n_vec); - if (match_vec.u64) return h - sz_u64_clz(match_vec.u64) / 8; - } -#endif - - for (; h >= h_start; --h) - if (*h == *n) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 2Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 2byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_2byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 2byte is set. - // For that take the bottom 15 bits of each 2byte, add one to them, - // and if this sets the top bit to one, then all the 15 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFF7FFF7FFF7FFFull) + 0x0001000100010001ull) & ((vec.u64 & 0x8000800080008000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b two-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_2byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 2 bytes long. - sz_assert(h_length >= 2 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. 
- for (; ((sz_size_t)h & 7ull) && h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; -#endif - - sz_u64_vec_t h_even_vec, h_odd_vec, n_vec, matches_even_vec, matches_odd_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1]; - n_vec.u64 *= 0x0001000100010001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. - for (; h + 9 <= h_end; h += 8) { - h_even_vec.u64 = *(sz_u64_t *)h; - h_odd_vec.u64 = (h_even_vec.u64 >> 8) | ((sz_u64_t)h[8] << 56); - matches_even_vec = _sz_u64_each_2byte_equal(h_even_vec, n_vec); - matches_odd_vec = _sz_u64_each_2byte_equal(h_odd_vec, n_vec); - - matches_even_vec.u64 >>= 8; - if (matches_even_vec.u64 + matches_odd_vec.u64) { - sz_u64_t match_indicators = matches_even_vec.u64 | matches_odd_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 2 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 4Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 4byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_4byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 4byte is set. - // For that take the bottom 31 bits of each 4byte, add one to them, - // and if this sets the top bit to one, then all the 31 bits are ones as well. - vec.u64 = ((vec.u64 & 0x7FFFFFFF7FFFFFFFull) + 0x0000000100000001ull) & ((vec.u64 & 0x8000000080000000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b four-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_4byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 4 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 4 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; -#endif - - sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, n_vec, matches0_vec, matches1_vec, matches2_vec, matches3_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2], n_vec.u8s[3] = n[3]; - n_vec.u64 *= 0x0000000100000001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using four 64-bit words. - // We load the subsequent four-byte word as well, taking its first bytes. 
Think of it as a glorified prefetch :) - sz_u64_t h_page_current, h_page_next; - for (; h + sizeof(sz_u64_t) + sizeof(sz_u32_t) <= h_end; h += sizeof(sz_u64_t)) { - h_page_current = *(sz_u64_t *)h; - h_page_next = *(sz_u32_t *)(h + 8); - h0_vec.u64 = (h_page_current); - h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); - h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); - h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); - matches0_vec = _sz_u64_each_4byte_equal(h0_vec, n_vec); - matches1_vec = _sz_u64_each_4byte_equal(h1_vec, n_vec); - matches2_vec = _sz_u64_each_4byte_equal(h2_vec, n_vec); - matches3_vec = _sz_u64_each_4byte_equal(h3_vec, n_vec); - - if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64) { - matches0_vec.u64 >>= 24; - matches1_vec.u64 >>= 16; - matches2_vec.u64 >>= 8; - sz_u64_t match_indicators = matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 4 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief 3Byte-level equality comparison between two 64-bit integers. - * @return 64-bit integer, where every top bit in each 3byte signifies a match. - */ -SZ_INTERNAL sz_u64_vec_t _sz_u64_each_3byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) { - sz_u64_vec_t vec; - vec.u64 = ~(a.u64 ^ b.u64); - // The match is valid, if every bit within each 4byte is set. - // For that take the bottom 31 bits of each 4byte, add one to them, - // and if this sets the top bit to one, then all the 31 bits are ones as well. - vec.u64 = ((vec.u64 & 0xFFFF7FFFFF7FFFFFull) + 0x0000000001000001ull) & ((vec.u64 & 0x0000800000800000ull)); - return vec; -} - -/** - * @brief Find the first occurrence of a @b three-character needle in an arbitrary length haystack. - * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. - */ -SZ_INTERNAL sz_cptr_t _sz_find_3byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - - // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. - sz_assert(h_length >= 3 && "The haystack is too short."); - sz_cptr_t const h_end = h + h_length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h + 3 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h; -#endif - - // We fetch 12 - sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, h4_vec; - sz_u64_vec_t matches0_vec, matches1_vec, matches2_vec, matches3_vec, matches4_vec; - sz_u64_vec_t n_vec; - n_vec.u64 = 0; - n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2]; - n_vec.u64 *= 0x0000000001000001ull; // broadcast - - // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using three 64-bit words. - // We load the subsequent two-byte word as well. 
- sz_u64_t h_page_current, h_page_next; - for (; h + sizeof(sz_u64_t) + sizeof(sz_u16_t) <= h_end; h += sizeof(sz_u64_t)) { - h_page_current = *(sz_u64_t *)h; - h_page_next = *(sz_u16_t *)(h + 8); - h0_vec.u64 = (h_page_current); - h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56); - h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48); - h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40); - h4_vec.u64 = (h_page_current >> 32) | (h_page_next << 32); - matches0_vec = _sz_u64_each_3byte_equal(h0_vec, n_vec); - matches1_vec = _sz_u64_each_3byte_equal(h1_vec, n_vec); - matches2_vec = _sz_u64_each_3byte_equal(h2_vec, n_vec); - matches3_vec = _sz_u64_each_3byte_equal(h3_vec, n_vec); - matches4_vec = _sz_u64_each_3byte_equal(h4_vec, n_vec); - - if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64) { - matches0_vec.u64 >>= 16; - matches1_vec.u64 >>= 8; - matches3_vec.u64 <<= 8; - matches4_vec.u64 <<= 16; - sz_u64_t match_indicators = - matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64; - return h + sz_u64_ctz(match_indicators) / 8; - } - } - - for (; h + 3 <= h_end; ++h) - if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h; - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_find_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - // Several popular string matching algorithms are using a bad-character shift table. - // Boyer Moore: https://www-igm.univ-mlv.fr/~lecroq/string/node14.html - // Quick Search: https://www-igm.univ-mlv.fr/~lecroq/string/node19.html - // Smith: https://www-igm.univ-mlv.fr/~lecroq/string/node21.html - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) bad_shift_table.jumps[n[i]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the last `n_length - 1` bytes. 
- for (sz_size_t i = 0; i <= h_length - n_length;) { - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - i += bad_shift_table.jumps[h[i + n_length - 1]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Boyer-Moore-Horspool algorithm for @b reverse-order exact matching of patterns up to @b 256-bytes long. - * Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, // - sz_cptr_t n_chars, sz_size_t n_length) { - sz_assert(n_length <= 256 && "The pattern is too long."); - union { - sz_u8_t jumps[256]; - sz_u64_vec_t vecs[64]; - } bad_shift_table; - - // Let's initialize the table using SWAR to the total length of the string. - sz_u8_t const *h = (sz_u8_t const *)h_chars; - sz_u8_t const *n = (sz_u8_t const *)n_chars; - { - sz_u64_vec_t n_length_vec; - n_length_vec.u64 = n_length; - n_length_vec.u64 *= 0x0101010101010101ull; // broadcast - for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64; - for (sz_size_t i = 0; i + 1 < n_length; ++i) - bad_shift_table.jumps[n[n_length - i - 1]] = (sz_u8_t)(n_length - i - 1); - } - - // Another common heuristic is to match a few characters from different parts of a string. - // Raita suggests to use the first two, the last, and the middle character of the pattern. - sz_u32_vec_t h_vec, n_vec; - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into an unsigned integer. - n_vec.u8s[0] = n[offset_first]; - n_vec.u8s[1] = n[offset_first + 1]; - n_vec.u8s[2] = n[offset_mid]; - n_vec.u8s[3] = n[offset_last]; - - // Scan through the whole haystack, skipping the first `n_length - 1` bytes. - for (sz_size_t j = 0; j <= h_length - n_length;) { - sz_size_t i = h_length - n_length - j; - h_vec.u8s[0] = h[i + offset_first]; - h_vec.u8s[1] = h[i + offset_first + 1]; - h_vec.u8s[2] = h[i + offset_mid]; - h_vec.u8s[3] = h[i + offset_last]; - if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i; - j += bad_shift_table.jumps[h[i]]; - } - return SZ_NULL_CHAR; -} - -/** - * @brief Exact substring search helper function, that finds the first occurrence of a prefix of the needle - * using a given search function, and then verifies the remaining part of the needle. - */ -SZ_INTERNAL sz_cptr_t _sz_find_with_prefix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_prefix, sz_size_t prefix_length) { - - sz_size_t suffix_length = n_length - prefix_length; - while (1) { - sz_cptr_t found = find_prefix(h, h_length, n, prefix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = h_length - (found - h); - if (remaining < n_length) return SZ_NULL_CHAR; - if (sz_equal(found + prefix_length, n + prefix_length, suffix_length)) return found; - - // Adjust the position. 
- h = found + 1; - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -/** - * @brief Exact reverse-order substring search helper function, that finds the last occurrence of a suffix of the - * needle using a given search function, and then verifies the remaining part of the needle. - */ -SZ_INTERNAL sz_cptr_t _sz_rfind_with_suffix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length, - sz_find_t find_suffix, sz_size_t suffix_length) { - - sz_size_t prefix_length = n_length - suffix_length; - while (1) { - sz_cptr_t found = find_suffix(h, h_length, n + prefix_length, suffix_length); - if (!found) return SZ_NULL_CHAR; - - // Verify the remaining part of the needle - sz_size_t remaining = found - h; - if (remaining < prefix_length) return SZ_NULL_CHAR; - if (sz_equal(found - prefix_length, n, prefix_length)) return found - prefix_length; - - // Adjust the position. - h_length = remaining - 1; - } - - // Unreachable, but helps silence compiler warnings: - return SZ_NULL_CHAR; -} - -SZ_INTERNAL sz_cptr_t _sz_find_over_4bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, (sz_find_t)_sz_find_4byte_serial, 4); -} - -SZ_INTERNAL sz_cptr_t _sz_find_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, - sz_size_t n_length) { - return _sz_find_with_prefix(h, h_length, n, n_length, _sz_find_horspool_upto_256bytes_serial, 256); -} - -SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, - sz_size_t n_length) { - return _sz_rfind_with_suffix(h, h_length, n, n_length, _sz_rfind_horspool_upto_256bytes_serial, 256); -} - -SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - -#if _SZ_IS_BIG_ENDIAN - sz_find_t backends[] = { - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[(n_length > 1) + (n_length > 256)](h, h_length, n, n_length); -#else - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_2byte_serial, - (sz_find_t)_sz_find_3byte_serial, - (sz_find_t)_sz_find_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - (sz_find_t)_sz_find_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. - (n_length > 1) + (n_length > 2) + (n_length > 3) + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 4) + - // For longer needles - use skip tables. - (n_length > 8) + (n_length > 256)](h, h_length, n, n_length); -#endif -} - -SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - - sz_find_t backends[] = { - // For very short strings brute-force SWAR makes sense. 
- (sz_find_t)sz_rfind_byte_serial, - // TODO: implement reverse-order SWAR for 2/3/4 byte variants. - // TODO: (sz_find_t)_sz_rfind_2byte_serial, - // TODO: (sz_find_t)_sz_rfind_3byte_serial, - // TODO: (sz_find_t)_sz_rfind_4byte_serial, - // To avoid constructing the skip-table, let's use the prefixed approach. - // (sz_find_t)_sz_rfind_over_4bytes_serial, - // For longer needles - use skip tables. - (sz_find_t)_sz_rfind_horspool_upto_256bytes_serial, - (sz_find_t)_sz_rfind_horspool_over_256bytes_serial, - }; - - return backends[ - // For very short strings brute-force SWAR makes sense. - 0 + - // To avoid constructing the skip-table, let's use the prefixed approach. - (n_length > 1) + - // For longer needles - use skip tables. - (n_length > 256)](h, h_length, n, n_length); -} - -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // TODO: Generalize to remove the following asserts! - sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix."); - sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet."); - sz_unused(longer_length && bound); - - // We are going to store 3 diagonals of the matrix. - // The length of the longest (main) diagonal would be `n = (shorter_length + 1)`. - sz_size_t n = shorter_length + 1; - sz_size_t buffer_length = sizeof(sz_size_t) * n * 3; - sz_size_t *distances = (sz_size_t *)alloc->allocate(buffer_length, alloc->handle); - if (!distances) return SZ_SIZE_MAX; - - sz_size_t *previous_distances = distances; - sz_size_t *current_distances = previous_distances + n; - sz_size_t *next_distances = previous_distances + n * 2; - - // Initialize the first two diagonals: - previous_distances[0] = 0; - current_distances[0] = current_distances[1] = 1; - - // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_diagonal_index = 2; - for (; next_diagonal_index != n; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = next_diagonal_index + 1; - for (sz_size_t i = 0; i + 2 < next_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[next_diagonal_index - i - 2] != longer[i]; - sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; - sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; - next_distances[i + 1] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); - } - // Don't forget to populate the first row and the first column of the Levenshtein matrix. - next_distances[0] = next_distances[next_diagonal_length - 1] = next_diagonal_index; - // Perform a circular rotation of those buffers, to reuse the memory. - sz_size_t *temporary = previous_distances; - previous_distances = current_distances; - current_distances = next_distances; - next_distances = temporary; - } - - // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a - // larger diagonal. From now onwards, we will be shrinking. 
Instead of adding value equal to the skewed diagonal - // index on either side, we will be cropping those values out. - sz_size_t diagonals_count = n + n - 1; - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; - for (sz_size_t i = 0; i != next_diagonal_length; ++i) { - sz_size_t cost_of_substitution = shorter[shorter_length - 1 - i] != longer[next_diagonal_index - n + i]; - sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution; - sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; - next_distances[i] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); - } - // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift, - // dropping the first element in the current array. - sz_size_t *temporary = previous_distances; - previous_distances = current_distances + 1; - current_distances = next_distances; - next_distances = temporary; - } - - // Cache scalar before `free` call. - sz_size_t result = current_distances[0]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -/** - * @brief Describes the length of a UTF8 character / codepoint / rune in bytes. - */ -typedef enum { - sz_utf8_invalid_k = 0, //!< Invalid UTF8 character. - sz_utf8_rune_1byte_k = 1, //!< 1-byte UTF8 character. - sz_utf8_rune_2bytes_k = 2, //!< 2-byte UTF8 character. - sz_utf8_rune_3bytes_k = 3, //!< 3-byte UTF8 character. - sz_utf8_rune_4bytes_k = 4, //!< 4-byte UTF8 character. -} sz_rune_length_t; - -typedef sz_u32_t sz_rune_t; - -/** - * @brief Extracts just one UTF8 codepoint from a UTF8 string into a 32-bit unsigned integer. - */ -SZ_INTERNAL void _sz_extract_utf8_rune(sz_cptr_t utf8, sz_rune_t *code, sz_rune_length_t *code_length) { - sz_u8_t const *current = (sz_u8_t const *)utf8; - sz_u8_t leading_byte = *current++; - sz_rune_t ch; - sz_rune_length_t ch_length; - - // TODO: This can be made entirely branchless using 32-bit SWAR. - if (leading_byte < 0x80) { - // Single-byte rune (0xxxxxxx) - ch = leading_byte; - ch_length = sz_utf8_rune_1byte_k; - } - else if ((leading_byte & 0xE0) == 0xC0) { - // Two-byte rune (110xxxxx 10xxxxxx) - ch = (leading_byte & 0x1F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_2bytes_k; - } - else if ((leading_byte & 0xF0) == 0xE0) { - // Three-byte rune (1110xxxx 10xxxxxx 10xxxxxx) - ch = (leading_byte & 0x0F) << 12; - ch |= (*current++ & 0x3F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_3bytes_k; - } - else if ((leading_byte & 0xF8) == 0xF0) { - // Four-byte rune (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) - ch = (leading_byte & 0x07) << 18; - ch |= (*current++ & 0x3F) << 12; - ch |= (*current++ & 0x3F) << 6; - ch |= (*current++ & 0x3F); - ch_length = sz_utf8_rune_4bytes_k; - } - else { - // Invalid UTF8 rune. - ch = 0; - ch_length = sz_utf8_invalid_k; - } - *code = ch; - *code_length = ch_length; -} - -/** - * @brief Exports a UTF8 string into a UTF32 buffer. - * ! The result is undefined id the UTF8 string is corrupted. - * @return The length in the number of codepoints. 
- */ -SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_length, sz_rune_t *utf32) { - sz_cptr_t const end = utf8 + utf8_length; - sz_size_t count = 0; - sz_rune_length_t rune_length; - for (; utf8 != end; utf8 += rune_length, utf32++, count++) _sz_extract_utf8_rune(utf8, utf32, &rune_length); - return count; -} - -/** - * @brief Compute the Levenshtein distance between two strings using the Wagner-Fisher algorithm. - * Stores only 2 rows of the Levenshtein matrix, but uses 64-bit integers for the distance values, - * and upcasts UTF8 variable-length codepoints to 64-bit integers for faster addressing. - * - * ! In the worst case for 2 strings of length 100, that contain just one 16-bit codepoint this will result in extra: - * + 2 rows * 100 slots * 8 bytes/slot = 1600 bytes of memory for the two rows of the Levenshtein matrix rows. - * + 100 codepoints * 2 strings * 4 bytes/codepoint = 800 bytes of memory for the UTF8 buffer. - * = 2400 bytes of memory or @b 12x memory amplification! - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_bool_t can_be_unicode, sz_memory_allocator_t *alloc) { - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // A good idea may be to dispatch different kernels for different string lengths. - // Like using `uint8_t` counters for strings under 255 characters long. - // Good in theory, this results in frequent upcasts and downcasts in serial code. - // On strings over 20 bytes, using `uint8` over `uint64` on 64-bit x86 CPU doubles the execution time. - // So one must be very cautious with such optimizations. - typedef sz_size_t _distance_t; - - // Compute the number of columns in our Levenshtein matrix. - sz_size_t const n = shorter_length + 1; - - // If a buffering memory-allocator is provided, this operation is practically free, - // and cheaper than allocating even 512 bytes (for small distance matrices) on stack. - sz_size_t buffer_length = sizeof(_distance_t) * (n * 2); - - // If the strings contain Unicode characters, let's estimate the max character width, - // and use it to allocate a larger buffer to decode UTF8. - if ((can_be_unicode == sz_true_k) && - (sz_isascii(longer, longer_length) == sz_false_k || sz_isascii(shorter, shorter_length) == sz_false_k)) { - buffer_length += (shorter_length + longer_length) * sizeof(sz_rune_t); - } - else { can_be_unicode = sz_false_k; } - - // If the allocation fails, return the maximum distance. - sz_ptr_t const buffer = (sz_ptr_t)alloc->allocate(buffer_length, alloc->handle); - if (!buffer) return SZ_SIZE_MAX; - - // Let's export the UTF8 sequence into the newly allocated buffer at the end. - if (can_be_unicode == sz_true_k) { - sz_rune_t *const longer_utf32 = (sz_rune_t *)(buffer + sizeof(_distance_t) * (n * 2)); - sz_rune_t *const shorter_utf32 = longer_utf32 + longer_length; - // Export the UTF8 sequences into the newly allocated buffer. 
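As a quick sanity check of the rune extraction defined above, here is a minimal usage sketch. It assumes `stringzilla.h` is included so that the internal `_sz_extract_utf8_rune` helper is visible in the translation unit; the sample string mixes 1-, 2-, 3-, and 4-byte runes:

#include <stdio.h>
#include <stringzilla/stringzilla.h> // assumption: the single-header library is on the include path

int main(void) {
    char const *text = "a\xC3\xA9\xE2\x82\xAC\xF0\x9F\x98\x80"; // "a", "é", "€", "😀"
    char const *end = text + 10;
    while (text < end) {
        sz_rune_t rune;
        sz_rune_length_t rune_length;
        _sz_extract_utf8_rune(text, &rune, &rune_length);
        if (rune_length == sz_utf8_invalid_k) break; // the result is undefined for corrupted input
        printf("U+%04X spans %d byte(s)\n", (unsigned)rune, (int)rune_length);
        text += rune_length;
    }
    return 0;
}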
- longer_length = _sz_export_utf8_to_utf32(longer, longer_length, longer_utf32); - shorter_length = _sz_export_utf8_to_utf32(shorter, shorter_length, shorter_utf32); - longer = (sz_cptr_t)longer_utf32; - shorter = (sz_cptr_t)shorter_utf32; - } - - // Let's parameterize the core logic for different character types and distance types. -#define _wagner_fisher_unbounded(_distance_t, _char_t) \ - /* Now let's cast our pointer to avoid it in subsequent sections. */ \ - _char_t const *const longer_chars = (_char_t const *)longer; \ - _char_t const *const shorter_chars = (_char_t const *)shorter; \ - _distance_t *previous_distances = (_distance_t *)buffer; \ - _distance_t *current_distances = previous_distances + n; \ - /* Initialize the first row of the Levenshtein matrix with `iota`-style arithmetic progression. */ \ - for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ - /* The main loop of the algorithm with quadratic complexity. */ \ - for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ - _char_t const longer_char = longer_chars[idx_longer]; \ - /* Using pure pointer arithmetic is faster than iterating with an index. */ \ - _char_t const *shorter_ptr = shorter_chars; \ - _distance_t const *previous_ptr = previous_distances; \ - _distance_t *current_ptr = current_distances; \ - _distance_t *const current_end = current_ptr + shorter_length; \ - current_ptr[0] = idx_longer + 1; \ - for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ - _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ - /* We can avoid `+1` for costs here, shifting it to post-minimum computation, */ \ - /* saving one increment operation. */ \ - _distance_t cost_deletion = previous_ptr[1]; \ - _distance_t cost_insertion = current_ptr[0]; \ - /* ? It might be a good idea to enforce branchless execution here. */ \ - /* ? The caveat being that the benchmarks on longer sequences backfire and more research is needed. */ \ - current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ - } \ - /* Swap `previous_distances` and `current_distances` pointers. */ \ - _distance_t *temporary = previous_distances; \ - previous_distances = current_distances; \ - current_distances = temporary; \ - } \ - /* Cache scalar before `free` call. */ \ - sz_size_t result = previous_distances[shorter_length]; \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return result; - - // Let's define a separate variant for bounded distance computation. - // Practically the same as unbounded, but also collecting the running minimum within each row for early exit. 
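Before the bounded variant, readers who find the macro form dense may prefer this minimal standalone sketch of the same two-row recurrence on plain `size_t` buffers (no allocator, no UTF-8 handling); it should print a distance of 3 for "kitten" vs "sitting":

#include <stdio.h>
#include <string.h>

static size_t min2(size_t a, size_t b) { return a < b ? a : b; }

int main(void) {
    char const *longer = "sitting", *shorter = "kitten";
    size_t longer_length = strlen(longer), shorter_length = strlen(shorter);
    size_t previous[32], current[32]; // enough for this toy example
    for (size_t j = 0; j <= shorter_length; ++j) previous[j] = j;
    for (size_t i = 0; i != longer_length; ++i) {
        current[0] = i + 1;
        for (size_t j = 0; j != shorter_length; ++j) {
            size_t substitution = previous[j] + (longer[i] != shorter[j]);
            size_t deletion = previous[j + 1];
            size_t insertion = current[j];
            current[j + 1] = min2(substitution, min2(deletion, insertion) + 1);
        }
        memcpy(previous, current, (shorter_length + 1) * sizeof(size_t)); // pointer swap in the real code
    }
    printf("distance = %zu\n", previous[shorter_length]); // 3
    return 0;
}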
-#define _wagner_fisher_bounded(_distance_t, _char_t) \ - _char_t const *const longer_chars = (_char_t const *)longer; \ - _char_t const *const shorter_chars = (_char_t const *)shorter; \ - _distance_t *previous_distances = (_distance_t *)buffer; \ - _distance_t *current_distances = previous_distances + n; \ - for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter; \ - for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) { \ - _char_t const longer_char = longer_chars[idx_longer]; \ - _char_t const *shorter_ptr = shorter_chars; \ - _distance_t const *previous_ptr = previous_distances; \ - _distance_t *current_ptr = current_distances; \ - _distance_t *const current_end = current_ptr + shorter_length; \ - current_ptr[0] = idx_longer + 1; \ - /* Initialize min_distance with a value greater than bound */ \ - _distance_t min_distance = bound - 1; \ - for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) { \ - _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]); \ - _distance_t cost_deletion = previous_ptr[1]; \ - _distance_t cost_insertion = current_ptr[0]; \ - current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1); \ - /* Keep track of the minimum distance seen so far in this row */ \ - min_distance = sz_min_of_two(current_ptr[1], min_distance); \ - } \ - /* If the minimum distance in this row exceeded the bound, return early */ \ - if (min_distance >= bound) { \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return bound; \ - } \ - _distance_t *temporary = previous_distances; \ - previous_distances = current_distances; \ - current_distances = temporary; \ - } \ - sz_size_t result = previous_distances[shorter_length]; \ - alloc->free(buffer, buffer_length, alloc->handle); \ - return sz_min_of_two(result, bound); - - // Dispatch the actual computation. - if (!bound) { - if (can_be_unicode == sz_true_k) { _wagner_fisher_unbounded(sz_size_t, sz_rune_t); } - else { _wagner_fisher_unbounded(sz_size_t, sz_u8_t); } - } - else { - if (can_be_unicode == sz_true_k) { _wagner_fisher_bounded(sz_size_t, sz_rune_t); } - else { _wagner_fisher_bounded(sz_size_t, sz_u8_t); } - } -} - -SZ_PUBLIC sz_size_t sz_edit_distance_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Let's make sure that we use the amount proportional to the - // number of elements in the shorter string, not the larger. - if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); - } - - // Skip the matching prefixes and suffixes, they won't affect the distance. - for (sz_cptr_t a_end = longer + longer_length, b_end = shorter + shorter_length; - longer != a_end && shorter != b_end && *longer == *shorter; - ++longer, ++shorter, --longer_length, --shorter_length); - for (; longer_length && shorter_length && longer[longer_length - 1] == shorter[shorter_length - 1]; - --longer_length, --shorter_length); - - // Bounded computations may exit early. - int const is_bounded = bound < longer_length; - if (is_bounded) { - // If one of the strings is empty - the edit distance is equal to the length of the other one. 
- if (longer_length == 0) return sz_min_of_two(shorter_length, bound);
- if (shorter_length == 0) return sz_min_of_two(longer_length, bound);
- // If the difference in length is beyond the `bound`, there is no need to check at all.
- if (longer_length - shorter_length > bound) return bound;
- }
-
- if (shorter_length == 0) return longer_length; // The shorter string is fully consumed after trimming, so the remaining length of the longer one is the distance (zero if they matched exactly).
- if (shorter_length == longer_length && !is_bounded)
- return _sz_edit_distance_skewed_diagonals_serial(longer, longer_length, shorter, shorter_length, bound, alloc);
- return _sz_edit_distance_wagner_fisher_serial(longer, longer_length, shorter, shorter_length, bound, sz_false_k,
- alloc);
-}
-
-SZ_PUBLIC sz_ssize_t sz_alignment_score_serial( //
- sz_cptr_t longer, sz_size_t longer_length, //
- sz_cptr_t shorter, sz_size_t shorter_length, //
- sz_error_cost_t const *subs, sz_error_cost_t gap, //
- sz_memory_allocator_t *alloc) {
-
- // If one of the strings is empty, the score is the length of the other one multiplied by the gap penalty.
- if (longer_length == 0) return (sz_ssize_t)shorter_length * gap;
- if (shorter_length == 0) return (sz_ssize_t)longer_length * gap;
-
- // Let's make sure that we use the amount of memory proportional to the
- // number of elements in the shorter string, not the larger.
- if (shorter_length > longer_length) {
- sz_pointer_swap((void **)&longer_length, (void **)&shorter_length);
- sz_pointer_swap((void **)&longer, (void **)&shorter);
- }
-
- // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
- sz_memory_allocator_t global_alloc;
- if (!alloc) {
- sz_memory_allocator_init_default(&global_alloc);
- alloc = &global_alloc;
- }
-
- sz_size_t n = shorter_length + 1;
- sz_size_t buffer_length = sizeof(sz_ssize_t) * n * 2;
- sz_ssize_t *distances = (sz_ssize_t *)alloc->allocate(buffer_length, alloc->handle);
- sz_ssize_t *previous_distances = distances;
- sz_ssize_t *current_distances = previous_distances + n;
-
- for (sz_size_t idx_shorter = 0; idx_shorter != n; ++idx_shorter)
- previous_distances[idx_shorter] = (sz_ssize_t)idx_shorter * gap;
-
- sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter;
- sz_u8_t const *longer_unsigned = (sz_u8_t const *)longer;
- for (sz_size_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) {
- current_distances[0] = ((sz_ssize_t)idx_longer + 1) * gap;
-
- // Select the row of substitution costs for the current character of the longer string.
- sz_error_cost_t const *a_subs = subs + longer_unsigned[idx_longer] * 256ul;
- for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) {
- sz_ssize_t cost_deletion = previous_distances[idx_shorter + 1] + gap;
- sz_ssize_t cost_insertion = current_distances[idx_shorter] + gap;
- sz_ssize_t cost_substitution = previous_distances[idx_shorter] + a_subs[shorter_unsigned[idx_shorter]];
- current_distances[idx_shorter + 1] = sz_max_of_three(cost_deletion, cost_insertion, cost_substitution);
- }
-
- // Swap the `previous_distances` and `current_distances` pointers.
- sz_pointer_swap((void **)&previous_distances, (void **)&current_distances);
- }
-
- // Cache scalar before `free` call.
- sz_ssize_t result = previous_distances[shorter_length]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -} - -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - - sz_size_t const min_length = sz_min_of_two(a_length, b_length); - sz_size_t const max_length = sz_max_of_two(a_length, b_length); - sz_cptr_t const a_end = a + min_length; - bound = bound == 0 ? max_length : bound; - - // Walk through both strings using SWAR and counting the number of differing characters. - sz_size_t distance = max_length - min_length; -#if SZ_USE_MISALIGNED_LOADS && !_SZ_IS_BIG_ENDIAN - if (min_length >= SZ_SWAR_THRESHOLD) { - sz_u64_vec_t a_vec, b_vec, match_vec; - for (; a + 8 <= a_end && distance < bound; a += 8, b += 8) { - a_vec.u64 = sz_u64_load(a).u64; - b_vec.u64 = sz_u64_load(b).u64; - match_vec = _sz_u64_each_byte_equal(a_vec, b_vec); - distance += sz_u64_popcount((~match_vec.u64) & 0x8080808080808080ull); - } - } -#endif - - for (; a != a_end && distance < bound; ++a, ++b) { distance += (*a != *b); } - return sz_min_of_two(distance, bound); -} - -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - - sz_cptr_t const a_end = a + a_length; - sz_cptr_t const b_end = b + b_length; - sz_size_t distance = 0; - - sz_rune_t a_rune, b_rune; - sz_rune_length_t a_rune_length, b_rune_length; - - if (bound) { - for (; a < a_end && b < b_end && distance < bound; a += a_rune_length, b += b_rune_length) { - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - distance += (a_rune != b_rune); - } - // If one string has more runes, we need to go through the tail. - if (distance < bound) { - for (; a < a_end && distance < bound; a += a_rune_length, ++distance) - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - - for (; b < b_end && distance < bound; b += b_rune_length, ++distance) - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - } - } - else { - for (; a < a_end && b < b_end; a += a_rune_length, b += b_rune_length) { - _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - distance += (a_rune != b_rune); - } - // If one string has more runes, we need to go through the tail. - for (; a < a_end; a += a_rune_length, ++distance) _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); - for (; b < b_end; b += b_rune_length, ++distance) _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); - } - return distance; -} - -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length) { - sz_u64_t checksum = 0; - sz_u8_t const *text_u8 = (sz_u8_t const *)text; - sz_u8_t const *text_end = text_u8 + length; - for (; text_u8 != text_end; ++text_u8) checksum += *text_u8; - return checksum; -} - -/** - * @brief Largest prime number that fits into 31 bits. - * @see https://mersenneforum.org/showthread.php?t=3471 - */ -#define SZ_U32_MAX_PRIME (2147483647u) - -/** - * @brief Largest prime number that fits into 64 bits. - * @see https://mersenneforum.org/showthread.php?t=3471 - * - * 2^64 = 18,446,744,073,709,551,616 - * this = 18,446,744,073,709,551,557 - * diff = 59 - */ -#define SZ_U64_MAX_PRIME (18446744073709551557ull) - -/* - * One hardware-accelerated way of mixing hashes can be CRC, but it's only implemented for 32-bit values. 
- * Using a Boost-like mixer works very poorly in such case: - * - * hash_first ^ (hash_second + 0x517cc1b727220a95 + (hash_first << 6) + (hash_first >> 2)); - * - * Let's stick to the Fibonacci hash trick using the golden ratio. - * https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ - */ -#define _sz_hash_mix(first, second) ((first * 11400714819323198485ull) ^ (second * 11400714819323198485ull)) -#define _sz_shift_low(x) (x) -#define _sz_shift_high(x) ((x + 77ull) & 0xFFull) -#define _sz_prime_mod(x) (x % SZ_U64_MAX_PRIME) - -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length) { - - sz_u64_t hash_low = 0; - sz_u64_t hash_high = 0; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - switch (length) { - case 0: return 0; - - // Texts under 7 bytes long are definitely below the largest prime. - case 1: - hash_low = _sz_shift_low(text[0]); - hash_high = _sz_shift_high(text[0]); - break; - case 2: - hash_low = _sz_shift_low(text[0]) * 31ull + _sz_shift_low(text[1]); - hash_high = _sz_shift_high(text[0]) * 257ull + _sz_shift_high(text[1]); - break; - case 3: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull + // - _sz_shift_low(text[2]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull + // - _sz_shift_high(text[2]); - break; - case 4: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull + // - _sz_shift_low(text[3]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull + // - _sz_shift_high(text[3]); - break; - case 5: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull + // - _sz_shift_low(text[4]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull + // - _sz_shift_high(text[4]); - break; - case 6: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull + // - _sz_shift_low(text[5]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull + // - _sz_shift_high(text[5]); - break; - case 7: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull * 31ull + // - _sz_shift_low(text[5]) * 31ull + // - _sz_shift_low(text[6]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull 
* 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull * 257ull + // - _sz_shift_high(text[5]) * 257ull + // - _sz_shift_high(text[6]); - break; - default: - // Unroll the first seven cycles: - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - hash_low = hash_low * 31ull + _sz_shift_low(text[1]); - hash_high = hash_high * 257ull + _sz_shift_high(text[1]); - hash_low = hash_low * 31ull + _sz_shift_low(text[2]); - hash_high = hash_high * 257ull + _sz_shift_high(text[2]); - hash_low = hash_low * 31ull + _sz_shift_low(text[3]); - hash_high = hash_high * 257ull + _sz_shift_high(text[3]); - hash_low = hash_low * 31ull + _sz_shift_low(text[4]); - hash_high = hash_high * 257ull + _sz_shift_high(text[4]); - hash_low = hash_low * 31ull + _sz_shift_low(text[5]); - hash_high = hash_high * 257ull + _sz_shift_high(text[5]); - hash_low = hash_low * 31ull + _sz_shift_low(text[6]); - hash_high = hash_high * 257ull + _sz_shift_high(text[6]); - text += 7; - - // Iterate throw the rest with the modulus: - for (; text != text_end; ++text) { - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - } - break; - } - - return _sz_hash_mix(hash_low, hash_high); -} - -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Compute the initial hash value for the first window. - sz_u64_t hash_low = 0, hash_high = 0, hash_mix; - for (sz_u8_t const *first_end = text + window_length; text < first_end; ++text) - hash_low = (hash_low * 31ull + _sz_shift_low(*text)) % SZ_U64_MAX_PRIME, - hash_high = (hash_high * 257ull + _sz_shift_high(*text)) % SZ_U64_MAX_PRIME; - - // In most cases the fingerprint length will be a power of two. - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - - // Compute the hash value for every window, exporting into the fingerprint, - // using the expensive modulo operation. - sz_size_t cycles = 1; - sz_size_t const step_mask = step - 1; - for (; text < text_end; ++text, ++cycles) { - // Discard one character: - hash_low -= _sz_shift_low(*(text - window_length)) * prime_power_low; - hash_high -= _sz_shift_high(*(text - window_length)) * prime_power_high; - // And add a new one: - hash_low = 31ull * hash_low + _sz_shift_low(*text); - hash_high = 257ull * hash_high + _sz_shift_high(*text); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - // Mix only if we've skipped enough hashes. 
- if ((cycles & step_mask) == 0) { - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - } - } -} - -#undef _sz_shift_low -#undef _sz_shift_high -#undef _sz_hash_mix -#undef _sz_prime_mod - -/** - * @brief Uses a small lookup-table to convert a lowercase character to uppercase. - */ -SZ_INTERNAL sz_u8_t sz_u8_tolower(sz_u8_t c) { - static sz_u8_t const lowered[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return lowered[c]; -} - -/** - * @brief Uses a small lookup-table to convert an uppercase character to lowercase. - */ -SZ_INTERNAL sz_u8_t sz_u8_toupper(sz_u8_t c) { - static sz_u8_t const upped[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return upped[c]; -} - -/** - * @brief Uses two small lookup tables (768 bytes total) to accelerate division by a small - * unsigned integer. Performs two lookups, one multiplication, two shifts, and two accumulations. - * - * @param divisor Integral value @b larger than one. - * @param number Integral value to divide. 
- */ -SZ_INTERNAL sz_u8_t sz_u8_divide(sz_u8_t number, sz_u8_t divisor) { - sz_assert(divisor > 1); - static sz_u16_t const multipliers[256] = { - 0, 0, 0, 21846, 0, 39322, 21846, 9363, 0, 50973, 39322, 29790, 21846, 15124, 9363, 4370, - 0, 57826, 50973, 44841, 39322, 34329, 29790, 25645, 21846, 18351, 15124, 12137, 9363, 6780, 4370, 2115, - 0, 61565, 57826, 54302, 50973, 47824, 44841, 42011, 39322, 36765, 34329, 32006, 29790, 27671, 25645, 23705, - 21846, 20063, 18351, 16706, 15124, 13602, 12137, 10725, 9363, 8049, 6780, 5554, 4370, 3224, 2115, 1041, - 0, 63520, 61565, 59668, 57826, 56039, 54302, 52614, 50973, 49377, 47824, 46313, 44841, 43407, 42011, 40649, - 39322, 38028, 36765, 35532, 34329, 33154, 32006, 30885, 29790, 28719, 27671, 26647, 25645, 24665, 23705, 22766, - 21846, 20945, 20063, 19198, 18351, 17520, 16706, 15907, 15124, 14356, 13602, 12863, 12137, 11424, 10725, 10038, - 9363, 8700, 8049, 7409, 6780, 6162, 5554, 4957, 4370, 3792, 3224, 2665, 2115, 1573, 1041, 517, - 0, 64520, 63520, 62535, 61565, 60609, 59668, 58740, 57826, 56926, 56039, 55164, 54302, 53452, 52614, 51788, - 50973, 50169, 49377, 48595, 47824, 47063, 46313, 45572, 44841, 44120, 43407, 42705, 42011, 41326, 40649, 39982, - 39322, 38671, 38028, 37392, 36765, 36145, 35532, 34927, 34329, 33738, 33154, 32577, 32006, 31443, 30885, 30334, - 29790, 29251, 28719, 28192, 27671, 27156, 26647, 26143, 25645, 25152, 24665, 24182, 23705, 23233, 22766, 22303, - 21846, 21393, 20945, 20502, 20063, 19628, 19198, 18772, 18351, 17933, 17520, 17111, 16706, 16305, 15907, 15514, - 15124, 14738, 14356, 13977, 13602, 13231, 12863, 12498, 12137, 11779, 11424, 11073, 10725, 10380, 10038, 9699, - 9363, 9030, 8700, 8373, 8049, 7727, 7409, 7093, 6780, 6470, 6162, 5857, 5554, 5254, 4957, 4662, - 4370, 4080, 3792, 3507, 3224, 2943, 2665, 2388, 2115, 1843, 1573, 1306, 1041, 778, 517, 258, - }; - // This table can be avoided using a single addition and counting trailing zeros. 
- static sz_u8_t const shifts[256] = { - 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // - 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // - 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // - 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // - }; - sz_u32_t multiplier = multipliers[divisor]; - sz_u8_t shift = shifts[divisor]; - - sz_u16_t q = (sz_u16_t)((multiplier * number) >> 16); - sz_u16_t t = ((number - q) >> 1) + q; - return (sz_u8_t)(t >> shift); -} - -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result) { - sz_u8_t const *unsigned_lut = (sz_u8_t const *)lut; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = unsigned_lut[*unsigned_text]; -} - -SZ_PUBLIC void sz_tolower_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_tolower(*unsigned_text); -} - -SZ_PUBLIC void sz_toupper_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_toupper(*unsigned_text); -} - -SZ_PUBLIC void sz_toascii_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = *unsigned_text & 0x7F; -} - -/** - * @brief Check if there is a byte in this buffer, that exceeds 127 and can't be an ASCII character. - * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. - */ -SZ_PUBLIC sz_bool_t sz_isascii_serial(sz_cptr_t text, sz_size_t length) { - - if (!length) return sz_true_k; - sz_u8_t const *h = (sz_u8_t const *)text; - sz_u8_t const *const h_end = h + length; - -#if !SZ_USE_MISALIGNED_LOADS - // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) - if (*h & 0x80ull) return sz_false_k; -#endif - - // Validate eight bytes at once using SWAR. - sz_u64_vec_t text_vec; - for (; h + 8 <= h_end; h += 8) { - text_vec.u64 = *(sz_u64_t const *)h; - if (text_vec.u64 & 0x8080808080808080ull) return sz_false_k; - } - - // Handle the misaligned tail. 
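Stepping back to `sz_u8_divide` above: because the domain is tiny, the multiply-and-shift approximation can be checked exhaustively against plain integer division. A small sketch, assuming `stringzilla.h` is included so the internal helper is visible; it is expected to report zero mismatches:

#include <stdio.h>
#include <stringzilla/stringzilla.h> // assumption: the single-header library is on the include path

int main(void) {
    unsigned mismatches = 0;
    for (unsigned divisor = 2; divisor < 256; ++divisor)     // the helper requires `divisor > 1`
        for (unsigned number = 0; number < 256; ++number)
            mismatches += sz_u8_divide((sz_u8_t)number, (sz_u8_t)divisor) != number / divisor;
    printf("mismatches: %u\n", mismatches); // expected: 0
    return 0;
}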
- for (; h < h_end; ++h)
- if (*h & 0x80ull) return sz_false_k;
- return sz_true_k;
-}
-
-SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length,
- sz_random_generator_t generator, void *generator_user_data) {
-
- sz_assert(alphabet_size > 0 && alphabet_size <= 256 && "Inadequate alphabet size");
-
- if (alphabet_size == 1) sz_fill(result, result_length, *alphabet);
-
- else {
- sz_assert(generator && "Expects a valid random generator");
- sz_u8_t divisor = (sz_u8_t)alphabet_size;
- for (sz_cptr_t end = result + result_length; result != end; ++result) {
- sz_u8_t random = generator(generator_user_data) & 0xFF;
- sz_u8_t quotient = sz_u8_divide(random, divisor);
- *result = alphabet[random - quotient * divisor];
- }
- }
-}
-
-#pragma endregion
-
-/*
- * Serial implementation of string class operations.
- */
-#pragma region Serial Implementation for the String Class
-
-SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string) {
- // It doesn't matter if it's on stack or heap, the pointer location is the same.
- return (sz_bool_t)((sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]);
-}
-
-SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length) {
- sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0];
- sz_size_t is_big_mask = is_small - 1ull;
- *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same.
- // If the string is small, use a branch-less approach to mask-out the top 7 bytes of the length.
- *length = string->external.length & (0x00000000000000FFull | is_big_mask);
-}
-
-SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space,
- sz_bool_t *is_external) {
- sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0];
- sz_size_t is_big_mask = is_small - 1ull;
- *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same.
- // If the string is small, use a branch-less approach to mask-out the top 7 bytes of the length.
- *length = string->external.length & (0x00000000000000FFull | is_big_mask);
- // In case the string is big (heap-allocated), the `is_small - 1ull` mask becomes 0xFFFFFFFFFFFFFFFFull.
- *space = sz_u64_blend(_SZ_STRING_INTERNAL_SPACE, string->external.space, is_big_mask);
- *is_external = (sz_bool_t)!is_small;
-}
-
-SZ_PUBLIC sz_bool_t sz_string_equal(sz_string_t const *a, sz_string_t const *b) {
- // It is tempting to compare the `external.length` words directly, but for an on-stack string the
- // high-order bytes of that word contain noise from the in-place payload, and we don't currently
- // maintain an invariant that would zero them out. So do this the hard (correct) way.
-
-#if SZ_USE_MISALIGNED_LOADS
- // Dealing with StringZilla strings, we know that the `start` pointer always points
- // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once.
-
-#endif
- // Alternatively, fall back to byte-by-byte comparison.
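The `is_small - 1ull` trick used in `sz_string_range` and `sz_string_unpack` above deserves a standalone look: subtracting one from a 0/1 flag yields an all-zeros or all-ones 64-bit mask, which then selects between the 8-bit on-stack length and the full 64-bit heap length without a branch. A tiny sketch:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint64_t raw_length = 0xAABBCCDDEEFF1107ull; // top 7 bytes are noise when the string is on-stack
    for (int is_small = 0; is_small <= 1; ++is_small) {
        uint64_t is_big_mask = (uint64_t)is_small - 1ull; // small -> 0, big -> 0xFFFFFFFFFFFFFFFF
        uint64_t length = raw_length & (0xFFull | is_big_mask);
        printf("is_small=%d -> length=0x%016llX\n", is_small, (unsigned long long)length);
    }
    return 0; // prints the full 64-bit value for the heap case and just 0x07 for the on-stack case
}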
- sz_ptr_t a_start, b_start; - sz_size_t a_length, b_length; - sz_string_range(a, &a_start, &a_length); - sz_string_range(b, &b_start, &b_length); - return (sz_bool_t)(a_length == b_length && sz_equal(a_start, b_start, b_length)); -} - -SZ_PUBLIC sz_ordering_t sz_string_order(sz_string_t const *a, sz_string_t const *b) { -#if SZ_USE_MISALIGNED_LOADS - // Dealing with StringZilla strings, we know that the `start` pointer always points - // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once. - -#endif - // Alternatively, fall back to byte-by-byte comparison. - sz_ptr_t a_start, b_start; - sz_size_t a_length, b_length; - sz_string_range(a, &a_start, &a_length); - sz_string_range(b, &b_start, &b_length); - return sz_order(a_start, a_length, b_start, b_length); -} - -SZ_PUBLIC void sz_string_init(sz_string_t *string) { - sz_assert(string && "String can't be SZ_NULL."); - - // Only 8 + 1 + 1 need to be initialized. - string->internal.start = &string->internal.chars[0]; - // But for safety let's initialize the entire structure to zeros. - // string->internal.chars[0] = 0; - // string->internal.length = 0; - string->words[1] = 0; - string->words[2] = 0; - string->words[3] = 0; -} - -SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator) { - sz_size_t space_needed = length + 1; // space for trailing \0 - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); - // Initialize the string to zeros for safety. - string->words[1] = 0; - string->words[2] = 0; - string->words[3] = 0; - // If we are lucky, no memory allocations will be needed. - if (space_needed <= _SZ_STRING_INTERNAL_SPACE) { - string->internal.start = &string->internal.chars[0]; - string->internal.length = (sz_u8_t)length; - } - else { - // If we are not lucky, we need to allocate memory. - string->external.start = (sz_ptr_t)allocator->allocate(space_needed, allocator->handle); - if (!string->external.start) return SZ_NULL_CHAR; - string->external.length = length; - string->external.space = space_needed; - } - sz_assert(&string->internal.start == &string->external.start && "Alignment confusion"); - string->external.start[length] = 0; - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - - sz_size_t new_space = new_capacity + 1; - if (new_space <= _SZ_STRING_INTERNAL_SPACE) return string->external.start; - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - sz_assert(new_space > string_space && "New space must be larger than current."); - - sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); - if (!new_start) return SZ_NULL_CHAR; - - sz_copy(new_start, string_start, string_length); - string->external.start = new_start; - string->external.space = new_space; - string->external.padding = 0; - string->external.length = string_length; - - // Deallocate the old string. 
- if (string_is_external) allocator->free(string_start, string_space, allocator->handle); - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "Strings and allocators can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // We may already be space-optimal, and in that case we don't need to do anything. - sz_size_t new_space = string_length + 1; - if (string_space == new_space || !string_is_external) return string->external.start; - - sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle); - if (!new_start) return SZ_NULL_CHAR; - - sz_copy(new_start, string_start, string_length); - string->external.start = new_start; - string->external.space = new_space; - string->external.padding = 0; - string->external.length = string_length; - - // Deallocate the old string. - if (string_is_external) allocator->free(string_start, string_space, allocator->handle); - return string->external.start; -} - -SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length, - sz_memory_allocator_t *allocator) { - - sz_assert(string && allocator && "String and allocator can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // The user intended to extend the string. - offset = sz_min_of_two(offset, string_length); - - // If we are lucky, no memory allocations will be needed. - if (string_length + added_length < string_space) { - sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); - string_start[string_length + added_length] = 0; - // Even if the string is on the stack, the `+=` won't affect the tail of the string. - string->external.length += added_length; - } - // If we are not lucky, we need to allocate more memory. - else { - sz_size_t next_planned_size = sz_max_of_two(SZ_CACHE_LINE_WIDTH, string_space * 2ull); - sz_size_t min_needed_space = sz_size_bit_ceil(offset + string_length + added_length + 1); - sz_size_t new_space = sz_max_of_two(min_needed_space, next_planned_size); - string_start = sz_string_reserve(string, new_space - 1, allocator); - if (!string_start) return SZ_NULL_CHAR; - - // Copy into the new buffer. - sz_move(string_start + offset + added_length, string_start + offset, string_length - offset); - string_start[string_length + added_length] = 0; - string->external.length = string_length + added_length; - } - - return string_start; -} - -SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length) { - - sz_assert(string && "String can't be SZ_NULL."); - - sz_ptr_t string_start; - sz_size_t string_length; - sz_size_t string_space; - sz_bool_t string_is_external; - sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external); - - // Normalize the offset, it can't be larger than the length. - offset = sz_min_of_two(offset, string_length); - - // We shouldn't normalize the length, to avoid overflowing on `offset + length >= string_length`, - // if receiving `length == SZ_SIZE_MAX`. 
After following expression the `length` will contain - // exactly the delta between original and final length of this `string`. - length = sz_min_of_two(length, string_length - offset); - - // There are 2 common cases, that wouldn't even require a `memmove`: - // 1. Erasing the entire contents of the string. - // In that case `length` argument will be equal or greater than `length` member. - // 2. Removing the tail of the string with something like `string.pop_back()` in C++. - // - // In both of those, regardless of the location of the string - stack or heap, - // the erasing is as easy as setting the length to the offset. - // In every other case, we must `memmove` the tail of the string to the left. - if (offset + length < string_length) - sz_move(string_start + offset, string_start + offset + length, string_length - offset - length); - - // The `string->external.length = offset` assignment would discard last characters - // of the on-the-stack string, but inplace subtraction would work. - string->external.length -= length; - string_start[string_length - length] = 0; - return length; -} - -SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator) { - if (!sz_string_is_on_stack(string)) - allocator->free(string->external.start, string->external.space, allocator->handle); - sz_string_init(string); -} - -#pragma endregion - -/* - * @brief Serial implementation for strings sequence processing. - */ -#pragma region Serial Implementation for Sequences - -SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) { - - sz_size_t matches = 0; - while (matches != sequence->count && predicate(sequence, sequence->order[matches])) ++matches; - - for (sz_size_t i = matches + 1; i < sequence->count; ++i) - if (predicate(sequence, sequence->order[i])) - sz_u64_swap(sequence->order + i, sequence->order + matches), ++matches; - - return matches; -} - -SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) { - - sz_size_t start_b = partition + 1; - - // If the direct merge is already sorted - if (!less(sequence, sequence->order[start_b], sequence->order[partition])) return; - - sz_size_t start_a = 0; - while (start_a <= partition && start_b <= sequence->count) { - - // If element 1 is in right place - if (!less(sequence, sequence->order[start_b], sequence->order[start_a])) { start_a++; } - else { - sz_size_t value = sequence->order[start_b]; - sz_size_t index = start_b; - - // Shift all the elements between element 1 - // element 2, right by 1. 
- while (index != start_a) { sequence->order[index] = sequence->order[index - 1], index--; } - sequence->order[start_a] = value; - - // Update all the pointers - start_a++; - partition++; - start_b++; - } - } -} - -SZ_PUBLIC void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - sz_u64_t *keys = sequence->order; - sz_size_t keys_count = sequence->count; - for (sz_size_t i = 1; i < keys_count; i++) { - sz_u64_t i_key = keys[i]; - sz_size_t j = i; - for (; j > 0 && less(sequence, i_key, keys[j - 1]); --j) keys[j] = keys[j - 1]; - keys[j] = i_key; - } -} - -SZ_INTERNAL void _sz_sift_down(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start, - sz_size_t end) { - sz_size_t root = start; - while (2 * root + 1 <= end) { - sz_size_t child = 2 * root + 1; - if (child + 1 <= end && less(sequence, order[child], order[child + 1])) { child++; } - if (!less(sequence, order[root], order[child])) { return; } - sz_u64_swap(order + root, order + child); - root = child; - } -} - -SZ_INTERNAL void _sz_heapify(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t count) { - sz_size_t start = (count - 2) / 2; - while (1) { - _sz_sift_down(sequence, less, order, start, count - 1); - if (start == 0) return; - start--; - } -} - -SZ_INTERNAL void _sz_heapsort(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last) { - sz_u64_t *order = sequence->order; - sz_size_t count = last - first; - _sz_heapify(sequence, less, order + first, count); - sz_size_t end = count - 1; - while (end > 0) { - sz_u64_swap(order + first, order + first + end); - end--; - _sz_sift_down(sequence, less, order + first, 0, end); - } -} - -SZ_PUBLIC void sz_sort_introsort_recursion(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, - sz_size_t last, sz_size_t depth) { - - sz_size_t length = last - first; - switch (length) { - case 0: - case 1: return; - case 2: - if (less(sequence, sequence->order[first + 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[first + 1]); - return; - case 3: { - sz_u64_t a = sequence->order[first]; - sz_u64_t b = sequence->order[first + 1]; - sz_u64_t c = sequence->order[first + 2]; - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - if (less(sequence, c, b)) sz_u64_swap(&c, &b); - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - sequence->order[first] = a; - sequence->order[first + 1] = b; - sequence->order[first + 2] = c; - return; - } - } - // Until a certain length, the quadratic-complexity insertion-sort is fine - if (length <= 16) { - sz_sequence_t sub_seq = *sequence; - sub_seq.order += first; - sub_seq.count = length; - sz_sort_insertion(&sub_seq, less); - return; - } - - // Fallback to N-logN-complexity heap-sort - if (depth == 0) { - _sz_heapsort(sequence, less, first, last); - return; - } - - --depth; - - // Median-of-three logic to choose pivot - sz_size_t median = first + length / 2; - if (less(sequence, sequence->order[median], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[median]); - if (less(sequence, sequence->order[last - 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[last - 1]); - if (less(sequence, sequence->order[median], sequence->order[last - 1])) - sz_u64_swap(&sequence->order[median], &sequence->order[last - 1]); - - // Partition using the median-of-three as the pivot - sz_u64_t pivot = sequence->order[median]; - sz_size_t left = first; - 
sz_size_t right = last - 1; - while (1) { - while (less(sequence, sequence->order[left], pivot)) left++; - while (less(sequence, pivot, sequence->order[right])) right--; - if (left >= right) break; - sz_u64_swap(&sequence->order[left], &sequence->order[right]); - left++; - right--; - } - - // Recursively sort the partitions - sz_sort_introsort_recursion(sequence, less, first, left, depth); - sz_sort_introsort_recursion(sequence, less, right + 1, last, depth); -} - -SZ_PUBLIC void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - if (sequence->count == 0) return; - sz_size_t size_is_not_power_of_two = (sequence->count & (sequence->count - 1)) != 0; - sz_size_t depth_limit = sz_size_log2i_nonzero(sequence->count) + size_is_not_power_of_two; - sz_sort_introsort_recursion(sequence, less, 0, sequence->count, depth_limit); -} - -SZ_PUBLIC void sz_sort_recursion( // - sz_sequence_t *sequence, sz_size_t bit_idx, sz_size_t bit_max, sz_sequence_comparator_t comparator, - sz_size_t partial_order_length) { - - if (!sequence->count) return; - - // Array of size one doesn't need sorting - only needs the prefix to be discarded. - if (sequence->count == 1) { - sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; - order_half_words[1] = 0; - return; - } - - // Partition a range of integers according to a specific bit value - sz_size_t split = 0; - sz_u64_t mask = (1ull << 63) >> bit_idx; - - // The clean approach would be to perform a single pass over the sequence. - // - // while (split != sequence->count && !(sequence->order[split] & mask)) ++split; - // for (sz_size_t i = split + 1; i < sequence->count; ++i) - // if (!(sequence->order[i] & mask)) sz_u64_swap(sequence->order + i, sequence->order + split), ++split; - // - // This, however, doesn't take into account the high relative cost of writes and swaps. - // To circumvent that, we can first count the total number entries to be mapped into either part. - // And then walk through both parts, swapping the entries that are in the wrong part. - // This would often lead to ~15% performance gain. - sz_size_t count_with_bit_set = 0; - for (sz_size_t i = 0; i != sequence->count; ++i) count_with_bit_set += (sequence->order[i] & mask) != 0; - split = sequence->count - count_with_bit_set; - - // It's possible that the sequence is already partitioned. - if (split != 0 && split != sequence->count) { - // Use two pointers to efficiently reposition elements. - // On pointer walks left-to-right from the start, and the other walks right-to-left from the end. - sz_size_t left = 0; - sz_size_t right = sequence->count - 1; - while (1) { - // Find the next element with the bit set on the left side. - while (left < split && !(sequence->order[left] & mask)) ++left; - // Find the next element without the bit set on the right side. - while (right >= split && (sequence->order[right] & mask)) --right; - // Swap the mispositioned elements. - if (left < split && right >= split) { - sz_u64_swap(sequence->order + left, sequence->order + right); - ++left; - --right; - } - else { break; } - } - } - - // Go down recursively. - if (bit_idx < bit_max) { - sz_sequence_t a = *sequence; - a.count = split; - sz_sort_recursion(&a, bit_idx + 1, bit_max, comparator, partial_order_length); - - sz_sequence_t b = *sequence; - b.order += split; - b.count -= split; - sz_sort_recursion(&b, bit_idx + 1, bit_max, comparator, partial_order_length); - } - // Reached the end of recursion. - else { - // Discard the prefixes. 
- sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; - for (sz_size_t i = 0; i != sequence->count; ++i) { order_half_words[i * 2 + 1] = 0; } - - sz_sequence_t a = *sequence; - a.count = split; - sz_sort_introsort(&a, comparator); - - sz_sequence_t b = *sequence; - b.order += split; - b.count -= split; - sz_sort_introsort(&b, comparator); - } -} - -SZ_INTERNAL sz_bool_t _sz_sort_is_less(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { - sz_cptr_t i_str = sequence->get_start(sequence, i_key); - sz_cptr_t j_str = sequence->get_start(sequence, j_key); - sz_size_t i_len = sequence->get_length(sequence, i_key); - sz_size_t j_len = sequence->get_length(sequence, j_key); - return (sz_bool_t)(sz_order_serial(i_str, i_len, j_str, j_len) == sz_less_k); -} - -SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t partial_order_length) { - -#if _SZ_IS_BIG_ENDIAN - // TODO: Implement partial sort for big-endian systems. For now this sorts the whole thing. - sz_unused(partial_order_length); - sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); -#else - - // Export up to 4 bytes into the `sequence` bits themselves - for (sz_size_t i = 0; i != sequence->count; ++i) { - sz_cptr_t begin = sequence->get_start(sequence, sequence->order[i]); - sz_size_t length = sequence->get_length(sequence, sequence->order[i]); - length = length > 4u ? 4u : length; - sz_ptr_t prefix = (sz_ptr_t)&sequence->order[i]; - for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; - } - - // Perform optionally-parallel radix sort on them - sz_sort_recursion(sequence, 0, 32, (sz_sequence_comparator_t)_sz_sort_is_less, partial_order_length); -#endif -} - -SZ_PUBLIC void sz_sort(sz_sequence_t *sequence) { -#if _SZ_IS_BIG_ENDIAN - sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); -#else - sz_sort_partial(sequence, sequence->count); -#endif -} - -#pragma endregion - -/* - * @brief AVX2 implementation of the string search algorithms. - * Very minimalistic, but still faster than the serial implementation. - */ -#pragma region AVX2 Implementation - -#if SZ_USE_HASWELL -#pragma GCC push_options -#pragma GCC target("avx2") -#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) -#include - -/** - * @brief Helper structure to simplify work with 256-bit registers. - */ -typedef union sz_u256_vec_t { - __m256i ymm; - __m128i xmms[2]; - sz_u64_t u64s[4]; - sz_u32_t u32s[8]; - sz_u16_t u16s[16]; - sz_u8_t u8s[32]; -} sz_u256_vec_t; - -SZ_PUBLIC sz_ordering_t sz_order_avx2(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: - //! https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations - return sz_order_serial(a, a_length, b, b_length); -} - -SZ_PUBLIC sz_bool_t sz_equal_avx2(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_u256_vec_t a_vec, b_vec; - - while (length >= 32) { - a_vec.ymm = _mm256_lddqu_si256((__m256i const *)a); - b_vec.ymm = _mm256_lddqu_si256((__m256i const *)b); - // One approach can be to use "movemasks", but we could also use a bitwise matching like `_mm256_testnzc_si256`. 
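// A small sketch of the prefix-packing step used by the partial sort above: the first (up to) four
// bytes of each string are copied into the top bytes of its 64-bit order entry, so that on a
// little-endian machine comparing the packed integers agrees with comparing the string prefixes
// lexicographically, while the low half keeps the original position that the recursion later
// re-exposes by zeroing the prefix. Illustrative helper, not part of the library.
#include <stddef.h>
#include <stdint.h>

static uint64_t pack_prefix_into_high_bytes_example(char const *begin, size_t length, uint32_t original_index) {
    uint64_t entry = original_index;       // low 32 bits are assumed to hold the original position
    uint8_t *bytes = (uint8_t *)&entry;
    size_t prefix = length > 4u ? 4u : length;
    for (size_t j = 0; j != prefix; ++j) bytes[7 - j] = (uint8_t)begin[j]; // most significant byte first
    return entry;
}
// Sorting such entries by their top 32 bits orders the strings by their 4-byte prefixes; ties are
// resolved afterwards with the comparison-based introsort.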
- int difference_mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi8(a_vec.ymm, b_vec.ymm)); - if (difference_mask == 0) { a += 32, b += 32, length -= 32; } - else { return sz_false_k; } - } - - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; -} - -SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - char value_char = *(char *)&value; - __m256i value_vec = _mm256_set1_epi8(value_char); - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores". - // - // for (; length >= 32; target += 32, length -= 32) _mm256_storeu_si256(target, value_vec); - // sz_fill_serial(target, length, value); - // - // When the buffer is small, there isn't much to innovate. - if (length <= 32) sz_fill_serial(target, length, value); - // When the buffer is aligned, we can avoid any split-stores. - else { - sz_size_t head_length = (32 - ((sz_size_t)target % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - sz_u16_t value16 = (sz_u16_t)value * 0x0101u; - sz_u32_t value32 = (sz_u32_t)value16 * 0x00010001u; - sz_u64_t value64 = (sz_u64_t)value32 * 0x0000000100000001ull; - - // Fill the head of the buffer. This part is much cleaner with AVX-512. - if (head_length & 1) *(sz_u8_t *)target = value, target++, head_length--; - if (head_length & 2) *(sz_u16_t *)target = value16, target += 2, head_length -= 2; - if (head_length & 4) *(sz_u32_t *)target = value32, target += 4, head_length -= 4; - if (head_length & 8) *(sz_u64_t *)target = value64, target += 8, head_length -= 8; - if (head_length & 16) - _mm_store_si128((__m128i *)target, _mm_set1_epi8(value_char)), target += 16, head_length -= 16; - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - - // Fill the aligned body of the buffer. - for (; body_length >= 32; target += 32, body_length -= 32) _mm256_store_si256((__m256i *)target, value_vec); - - // Fill the tail of the buffer. This part is much cleaner with AVX-512. - sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - if (tail_length & 16) - _mm_store_si128((__m128i *)target, _mm_set1_epi8(value_char)), target += 16, tail_length -= 16; - if (tail_length & 8) *(sz_u64_t *)target = value64, target += 8, tail_length -= 8; - if (tail_length & 4) *(sz_u32_t *)target = value32, target += 4, tail_length -= 4; - if (tail_length & 2) *(sz_u16_t *)target = value16, target += 2, tail_length -= 2; - if (tail_length & 1) *(sz_u8_t *)target = value, target++, tail_length--; - } -} - -SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores" and "loads". - // - // for (; length >= 32; target += 32, source += 32, length -= 32) - // _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - // sz_copy_serial(target, source, length); - // - // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core, - // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. - // For now, let's avoid the cases beyond the L2 size. 
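// The fill and copy kernels in this region keep reusing one pattern: peel off just enough bytes to
// reach the next aligned address, stream full aligned blocks, then finish the unaligned remainder.
// A hedged scalar sketch of that arithmetic for a 32-byte block size, with illustrative names:
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static void split_for_alignment_example(uintptr_t address, size_t length, //
                                        size_t *head, size_t *body, size_t *tail) {
    assert(length > 32); // the kernels above only take this path for more than one block
    *head = (32 - (address % 32)) % 32; // 31 or less, brings us to a 32-byte boundary
    *tail = (address + length) % 32;    // 31 or less, bytes past the last aligned block
    *body = length - *head - *tail;     // whole aligned blocks in the middle
    assert((address + *head) % 32 == 0);
    assert(*body % 32 == 0);
}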
- int is_huge = length > 1ull * 1024ull * 1024ull;
- if (length <= 32) { sz_copy_serial(target, source, length); }
- // When dealing with larger arrays, the optimization is not as simple as with the `sz_fill_avx2` function,
- // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer,
- // we can use aligned loads and stores, and the performance will be great.
- else if ((sz_size_t)target % 32 == 0 && (sz_size_t)source % 32 == 0 && !is_huge) {
- for (; length >= 32; target += 32, source += 32, length -= 32)
- _mm256_store_si256((__m256i *)target, _mm256_load_si256((__m256i const *)source));
- if (length) sz_copy_serial(target, source, length);
- }
- // The trickiest case is when both `source` and `target` are not aligned.
- // In this and simpler cases we can copy enough bytes into `target` to reach its cacheline boundary,
- // and then combine unaligned loads with aligned stores.
- else {
- sz_size_t head_length = (32 - ((sz_size_t)target % 32)) % 32; // 31 or less.
- sz_size_t tail_length = (sz_size_t)(target + length) % 32; // 31 or less.
- sz_size_t body_length = length - head_length - tail_length; // Multiple of 32.
-
- // Fill the head of the buffer. This part is much cleaner with AVX-512.
- if (head_length & 1) *(sz_u8_t *)target = *(sz_u8_t *)source, target++, source++, head_length--;
- if (head_length & 2) *(sz_u16_t *)target = *(sz_u16_t *)source, target += 2, source += 2, head_length -= 2;
- if (head_length & 4) *(sz_u32_t *)target = *(sz_u32_t *)source, target += 4, source += 4, head_length -= 4;
- if (head_length & 8) *(sz_u64_t *)target = *(sz_u64_t *)source, target += 8, source += 8, head_length -= 8;
- if (head_length & 16)
- _mm_store_si128((__m128i *)target, _mm_lddqu_si128((__m128i const *)source)), target += 16, source += 16,
- head_length -= 16;
- sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size.");
-
- // Fill the aligned body of the buffer.
- if (!is_huge) {
- for (; body_length >= 32; target += 32, source += 32, body_length -= 32)
- _mm256_store_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source));
- }
- // When the buffer is huge, we can traverse it in 2 directions.
- else {
- for (; body_length >= 64; target += 32, source += 32, body_length -= 64) {
- _mm256_store_si256((__m256i *)(target), _mm256_lddqu_si256((__m256i const *)(source)));
- _mm256_store_si256((__m256i *)(target + body_length - 32),
- _mm256_lddqu_si256((__m256i const *)(source + body_length - 32)));
- }
- if (body_length) _mm256_store_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source));
- }
-
- // Fill the tail of the buffer. This part is much cleaner with AVX-512.
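// A scalar sketch of the "walk from both ends" idea used for huge copies above: every iteration
// moves one block from the front and one from the back, so the remaining body shrinks by two
// blocks. memcpy stands in for the 32-byte vector load/store pair; names are illustrative only.
#include <stddef.h>
#include <string.h>

static void copy_from_both_ends_example(char *target, char const *source, size_t body_length) {
    while (body_length >= 64) {
        memcpy(target, source, 32);                                       // front block
        memcpy(target + body_length - 32, source + body_length - 32, 32); // back block
        target += 32, source += 32, body_length -= 64;
    }
    if (body_length) memcpy(target, source, 32); // at most one 32-byte block remains
}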
- sz_assert((sz_size_t)target % 32 == 0 && "Target is supposed to be aligned to the YMM register size."); - if (tail_length & 16) - _mm_store_si128((__m128i *)target, _mm_lddqu_si128((__m128i const *)source)), target += 16, source += 16, - tail_length -= 16; - if (tail_length & 8) *(sz_u64_t *)target = *(sz_u64_t *)source, target += 8, source += 8, tail_length -= 8; - if (tail_length & 4) *(sz_u32_t *)target = *(sz_u32_t *)source, target += 4, source += 4, tail_length -= 4; - if (tail_length & 2) *(sz_u16_t *)target = *(sz_u16_t *)source, target += 2, source += 2, tail_length -= 2; - if (tail_length & 1) *(sz_u8_t *)target = *(sz_u8_t *)source, target++, source++, tail_length--; - } -} - -SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - if (target < source || target >= source + length) { - for (; length >= 32; target += 32, source += 32, length -= 32) - _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); - while (length--) *(target++) = *(source++); - } - else { - // Jump to the end and walk backwards. - for (target += length, source += length; length >= 32; length -= 32) - _mm256_storeu_si256((__m256i *)(target -= 32), _mm256_lddqu_si256((__m256i const *)(source -= 32))); - while (length--) *(--target) = *(--source); - } -} - -SZ_PUBLIC sz_u64_t sz_checksum_avx2(sz_cptr_t text, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "loads". - // - // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core, - // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. - // For now, let's avoid the cases beyond the L2 size. - int is_huge = length > 1ull * 1024ull * 1024ull; - - // When the buffer is small, there isn't much to innovate. - if (length <= 32) { return sz_checksum_serial(text, length); } - else if (!is_huge) { - sz_u256_vec_t text_vec, sums_vec; - sums_vec.ymm = _mm256_setzero_si256(); - for (; length >= 32; text += 32, length -= 32) { - text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - sz_u64_t result = low + high; - if (length) result += sz_checksum_serial(text, length); - return result; - } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // Most notably, we can avoid populating the cache with the entire buffer, and instead traverse it in 2 directions. - else { - sz_size_t head_length = (32 - ((sz_size_t)text % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(text + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - sz_u64_t result = 0; - - // Handle the head - while (head_length--) result += *text++; - - sz_u256_vec_t text_vec, sums_vec; - sums_vec.ymm = _mm256_setzero_si256(); - // Fill the aligned body of the buffer. 
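// What the vectorized checksum above relies on: `_mm256_sad_epu8(v, zero)` adds each group of
// eight unsigned bytes into a 64-bit lane, so four such lanes accumulate a 32-byte block with no
// risk of overflow. A hedged scalar sketch of the same bookkeeping, with illustrative names:
#include <stddef.h>
#include <stdint.h>

static uint64_t checksum_bytes_example(uint8_t const *text, size_t length) {
    uint64_t lanes[4] = {0, 0, 0, 0}; // stand-ins for the four 64-bit lanes of the YMM accumulator
    size_t i = 0;
    for (; i + 32 <= length; i += 32)
        for (size_t lane = 0; lane != 4; ++lane)
            for (size_t byte = 0; byte != 8; ++byte) lanes[lane] += text[i + lane * 8 + byte];
    uint64_t result = lanes[0] + lanes[1] + lanes[2] + lanes[3]; // the 256 -> 128 -> 64 bit reduction
    while (i < length) result += text[i++];                      // serial tail
    return result;
}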
- if (!is_huge) { - for (; body_length >= 32; text += 32, body_length -= 32) { - text_vec.ymm = _mm256_stream_load_si256((__m256i const *)text); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - } - // When the biffer is huge, we can traverse it in 2 directions. - else { - sz_u256_vec_t text_reversed_vec, sums_reversed_vec; - sums_reversed_vec.ymm = _mm256_setzero_si256(); - for (; body_length >= 64; text += 64, body_length -= 64) { - text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - text_reversed_vec.ymm = _mm256_stream_load_si256((__m256i *)(text + body_length - 64)); - sums_reversed_vec.ymm = _mm256_add_epi64( - sums_reversed_vec.ymm, _mm256_sad_epu8(text_reversed_vec.ymm, _mm256_setzero_si256())); - } - if (body_length >= 32) { - text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, sums_reversed_vec.ymm); - } - - // Handle the tail - while (tail_length--) result += *text++; - - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - result += low + high; - return result; - } -} - -SZ_PUBLIC void sz_look_up_transform_avx2(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - // But if at least 3 cache lines are touched, the AVX-2 implementation should be faster. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } - - // We need to pull the lookup table into 8x YMM registers. - // The biggest issue is reorganizing the data in the lookup table, as AVX2 doesn't have 256-bit shuffle, - // it only has 128-bit "within-lane" shuffle. Still, it's wiser to use full YMM registers, instead of XMM, - // so that we can at least compensate high latency with twice larger window and one more level of lookup. 
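// The serial behaviour the AVX2 kernel here reproduces: every output byte is a plain lookup into a
// 256-byte table. Since the vector shuffle only indexes within 16-byte chunks, the index is split
// into a chunk number (high nibble) and a position within the chunk (low nibble). A hedged scalar
// sketch with illustrative names:
#include <stddef.h>
#include <stdint.h>

static void look_up_transform_scalar_example(uint8_t const *source, size_t length, uint8_t const *lut,
                                             uint8_t *target) {
    for (size_t i = 0; i != length; ++i) {
        uint8_t byte = source[i];
        uint8_t chunk = byte >> 4;            // which of the 16 sixteen-byte slices of the table
        uint8_t within = byte & 0x0F;         // position inside that slice, fed to the byte shuffle
        target[i] = lut[chunk * 16 + within]; // identical to lut[byte]
    }
}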
- sz_u256_vec_t lut_0_to_15_vec, lut_16_to_31_vec, lut_32_to_47_vec, lut_48_to_63_vec, // - lut_64_to_79_vec, lut_80_to_95_vec, lut_96_to_111_vec, lut_112_to_127_vec, // - lut_128_to_143_vec, lut_144_to_159_vec, lut_160_to_175_vec, lut_176_to_191_vec, // - lut_192_to_207_vec, lut_208_to_223_vec, lut_224_to_239_vec, lut_240_to_255_vec; - - lut_0_to_15_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut))); - lut_16_to_31_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 16))); - lut_32_to_47_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 32))); - lut_48_to_63_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 48))); - lut_64_to_79_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 64))); - lut_80_to_95_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 80))); - lut_96_to_111_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 96))); - lut_112_to_127_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 112))); - lut_128_to_143_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 128))); - lut_144_to_159_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 144))); - lut_160_to_175_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 160))); - lut_176_to_191_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 176))); - lut_192_to_207_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 192))); - lut_208_to_223_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 208))); - lut_224_to_239_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 224))); - lut_240_to_255_vec.ymm = _mm256_broadcastsi128_si256(_mm_lddqu_si128((__m128i const *)(lut + 240))); - - // Assuming each lookup is performed within 16 elements of 256, we need to reduce the scope by 16x = 2^4. - sz_u256_vec_t not_first_bit_vec, not_second_bit_vec, not_third_bit_vec, not_fourth_bit_vec; - - /// Top and bottom nibbles of the source are used separately. - sz_u256_vec_t source_vec, source_bot_vec; - sz_u256_vec_t blended_0_to_31_vec, blended_32_to_63_vec, blended_64_to_95_vec, blended_96_to_127_vec, - blended_128_to_159_vec, blended_160_to_191_vec, blended_192_to_223_vec, blended_224_to_255_vec; - - // Handling the head. - while (length >= 32) { - // Load and separate the nibbles of each byte in the source. - source_vec.ymm = _mm256_lddqu_si256((__m256i const *)source); - source_bot_vec.ymm = _mm256_and_si256(source_vec.ymm, _mm256_set1_epi8((char)0x0F)); - - // In the first round, we select using the 4th bit. 
- not_fourth_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x10), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_16_to_31_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_0_to_15_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_32_to_63_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_48_to_63_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_32_to_47_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_64_to_95_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_80_to_95_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_64_to_79_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_96_to_127_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_112_to_127_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_96_to_111_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_144_to_159_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_128_to_143_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_160_to_191_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_176_to_191_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_160_to_175_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_192_to_223_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_208_to_223_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_192_to_207_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - blended_224_to_255_vec.ymm = _mm256_blendv_epi8( // - _mm256_shuffle_epi8(lut_240_to_255_vec.ymm, source_bot_vec.ymm), // - _mm256_shuffle_epi8(lut_224_to_239_vec.ymm, source_bot_vec.ymm), // - not_fourth_bit_vec.ymm); - - // Perform a tree-like reduction of the 8x "blended" YMM registers, depending on the "source" content. - // The first round selects using the 3rd bit. - not_third_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x20), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_32_to_63_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_third_bit_vec.ymm); - blended_64_to_95_vec.ymm = _mm256_blendv_epi8( // - blended_96_to_127_vec.ymm, // - blended_64_to_95_vec.ymm, // - not_third_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - blended_160_to_191_vec.ymm, // - blended_128_to_159_vec.ymm, // - not_third_bit_vec.ymm); - blended_192_to_223_vec.ymm = _mm256_blendv_epi8( // - blended_224_to_255_vec.ymm, // - blended_192_to_223_vec.ymm, // - not_third_bit_vec.ymm); - - // The second round selects using the 2nd bit. - not_second_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x40), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_64_to_95_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_second_bit_vec.ymm); - blended_128_to_159_vec.ymm = _mm256_blendv_epi8( // - blended_192_to_223_vec.ymm, // - blended_128_to_159_vec.ymm, // - not_second_bit_vec.ymm); - - // The third round selects using the 1st bit. 
- not_first_bit_vec.ymm = _mm256_cmpeq_epi8( // - _mm256_and_si256(_mm256_set1_epi8((char)0x80), source_vec.ymm), _mm256_setzero_si256()); - blended_0_to_31_vec.ymm = _mm256_blendv_epi8( // - blended_128_to_159_vec.ymm, // - blended_0_to_31_vec.ymm, // - not_first_bit_vec.ymm); - - // And dump the result into the target. - _mm256_storeu_si256((__m256i *)target, blended_0_to_31_vec.ymm); - source += 32, target += 32, length -= 32; - } - - // Handle the tail. - if (length) sz_look_up_transform_serial(source, length, lut, target); -} - -SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - int mask; - sz_u256_vec_t h_vec, n_vec; - n_vec.ymm = _mm256_set1_epi8(n[0]); - - while (h_length >= 32) { - h_vec.ymm = _mm256_lddqu_si256((__m256i const *)h); - mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm)); - if (mask) return h + sz_u32_ctz(mask); - h += 32, h_length -= 32; - } - - return sz_find_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - int mask; - sz_u256_vec_t h_vec, n_vec; - n_vec.ymm = _mm256_set1_epi8(n[0]); - - while (h_length >= 32) { - h_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + h_length - 32)); - mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm)); - if (mask) return h + h_length - 1 - sz_u32_clz(mask); - h_length -= 32; - } - - return sz_rfind_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_avx2(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into YMM registers. - int matches; - sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]); - n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]); - n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]); - - // Scan through the string. - for (; h_length >= n_length + 32; h += 32, h_length -= 32) { - h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_first)); - h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_mid)); - h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_last)); - matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); - while (matches) { - int potential_offset = sz_u32_ctz(matches); - if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - - return sz_find_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx2(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. 
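// The match-candidate loop above leans on two bit tricks: count-trailing-zeros finds the lowest
// candidate offset, and `matches &= matches - 1` clears that bit so the next candidate can be
// examined. A hedged scalar sketch of filtering with three probe bytes and verifying with memcmp:
#include <stddef.h>
#include <string.h>

static char const *find_with_three_probes_example(char const *h, size_t h_length, char const *n, size_t n_length,
                                                  size_t first, size_t mid, size_t last) {
    for (size_t start = 0; start + n_length <= h_length; ++start) {
        int candidate = h[start + first] == n[first] && //
                        h[start + mid] == n[mid] &&     //
                        h[start + last] == n[last];
        // The vector kernel evaluates 32 such `candidate` bits at once and walks them with
        // `ctz` + `matches &= matches - 1`; here each candidate is verified immediately.
        if (candidate && memcmp(h + start, n, n_length) == 0) return h + start;
    }
    return NULL;
}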
- sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into YMM registers. - int matches; - sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]); - n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]); - n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]); - - // Scan through the string. - sz_cptr_t h_reversed; - for (; h_length >= n_length + 32; h_length -= 32) { - h_reversed = h + h_length - n_length - 32 + 1; - h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_first)); - h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_mid)); - h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_last)); - matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) & - _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm)); - while (matches) { - int potential_offset = sz_u32_clz(matches); - if (sz_equal(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - matches &= ~(1 << (31 - potential_offset)); - } - } - - return sz_rfind_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_avx2(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - - // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. - // That way when we invoke `_mm256_shuffle_epi8` we can use the same mask for both lanes. - sz_u256_vec_t filter_even_vec, filter_odd_vec; - for (sz_size_t i = 0; i != 16; ++i) - filter_even_vec.u8s[i] = filter->_u8s[i * 2], filter_odd_vec.u8s[i] = filter->_u8s[i * 2 + 1]; - filter_even_vec.xmms[1] = filter_even_vec.xmms[0]; - filter_odd_vec.xmms[1] = filter_odd_vec.xmms[0]; - - sz_u256_vec_t text_vec; - sz_u256_vec_t matches_vec; - sz_u256_vec_t lower_nibbles_vec, higher_nibbles_vec; - sz_u256_vec_t bitset_even_vec, bitset_odd_vec; - sz_u256_vec_t bitmask_vec, bitmask_lookup_vec; - bitmask_lookup_vec.ymm = _mm256_set_epi8(-128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, // - -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1); - - while (length >= 32) { - // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set" - // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so - // StrinZilla uses a somewhat different approach. - // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new - // - // sz_u8_t input = *(sz_u8_t const *)text; - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = filter_even_vec.u8s[hi_nibble]; - // sz_u8_t bitset_odd = filter_odd_vec.u8s[hi_nibble]; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_u8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd; - // if ((bitset & bitmask) != 0) return text; - // else { length--, text++; } - // - // The nice part about this, loading the strided data is vey easy with Arm NEON, - // while with x86 CPUs after AVX, shuffles within 256 bits shouldn't be an issue either. 
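// Putting the two halves of the commented reference together: for a byte `b`, the relevant filter
// byte is `_u8s[(b >> 4) * 2 + ((b >> 3) & 1)]`, which simplifies to `_u8s[b >> 3]`, and the bit
// inside it is `b & 7`. A hedged scalar membership check, assuming exactly the layout that the
// reference comments here imply:
#include <stdint.h>

static int charset_contains_example(uint8_t const bitset[32], uint8_t byte) {
    uint8_t relevant_byte = bitset[byte >> 3]; // the same byte the even/odd shuffles end up selecting
    uint8_t relevant_bit = (uint8_t)(1u << (byte & 7u));
    return (relevant_byte & relevant_bit) != 0;
}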
- text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); - lower_nibbles_vec.ymm = _mm256_and_si256(text_vec.ymm, _mm256_set1_epi8(0x0f)); - bitmask_vec.ymm = _mm256_shuffle_epi8(bitmask_lookup_vec.ymm, lower_nibbles_vec.ymm); - // - // At this point we can validate the `bitmask_vec` contents like this: - // - // for (sz_size_t i = 0; i != 32; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); - // } - // - // Shift right every byte by 4 bits. - // There is no `_mm256_srli_epi8` intrinsic, so we have to use `_mm256_srli_epi16` - // and combine it with a mask to clear the higher bits. - higher_nibbles_vec.ymm = _mm256_and_si256(_mm256_srli_epi16(text_vec.ymm, 4), _mm256_set1_epi8(0x0f)); - bitset_even_vec.ymm = _mm256_shuffle_epi8(filter_even_vec.ymm, higher_nibbles_vec.ymm); - bitset_odd_vec.ymm = _mm256_shuffle_epi8(filter_odd_vec.ymm, higher_nibbles_vec.ymm); - // - // At this point we can validate the `bitset_even_vec` and `bitset_odd_vec` contents like this: - // - // for (sz_size_t i = 0; i != 32; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t const *bitset_ptr = &filter->_u8s[0]; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; - // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); - // } - // - __m256i take_first = _mm256_cmpgt_epi8(_mm256_set1_epi8(8), lower_nibbles_vec.ymm); - bitset_even_vec.ymm = _mm256_blendv_epi8(bitset_odd_vec.ymm, bitset_even_vec.ymm, take_first); - - // It would have been great to have an instruction that tests the bits and then broadcasts - // the matching bit into all bits in that byte. But we don't have that, so we have to - // `and`, `cmpeq`, `movemask`, and then invert at the end... - matches_vec.ymm = _mm256_and_si256(bitset_even_vec.ymm, bitmask_vec.ymm); - matches_vec.ymm = _mm256_cmpeq_epi8(matches_vec.ymm, _mm256_setzero_si256()); - int matches_mask = ~_mm256_movemask_epi8(matches_vec.ymm); - if (matches_mask) { - int offset = sz_u32_ctz(matches_mask); - return text + offset; - } - else { text += 32, length -= 32; } - } - - return sz_find_charset_serial(text, length, filter); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx2(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); -} - -/** - * @brief There is no AVX2 instruction for fast multiplication of 64-bit integers. - * This implementation is coming from Agner Fog's Vector Class Library. 
- */ -SZ_INTERNAL __m256i _mm256_mul_epu64(__m256i a, __m256i b) { - __m256i bswap = _mm256_shuffle_epi32(b, 0xB1); - __m256i prodlh = _mm256_mullo_epi32(a, bswap); - __m256i zero = _mm256_setzero_si256(); - __m256i prodlh2 = _mm256_hadd_epi32(prodlh, zero); - __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2, 0x73); - __m256i prodll = _mm256_mul_epu32(a, b); - __m256i prod = _mm256_add_epi64(prodll, prodlh3); - return prod; -} - -SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - if (length < 4 * window_length) { - sz_hashes_serial(start, length, window_length, step, callback, callback_handle); - return; - } - - // Using AVX2, we can perform 4 long integer multiplications and additions within one register. - // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel. - sz_size_t const max_hashes = length - window_length + 1; - sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads. - sz_u8_t const *text_first = (sz_u8_t const *)start; - sz_u8_t const *text_second = text_first + min_hashes_per_thread; - sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2; - sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3; - sz_u8_t const *text_end = text_first + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Broadcast the constants into the registers. - sz_u256_vec_t prime_vec, golden_ratio_vec; - sz_u256_vec_t base_low_vec, base_high_vec, prime_power_low_vec, prime_power_high_vec, shift_high_vec; - base_low_vec.ymm = _mm256_set1_epi64x(31ull); - base_high_vec.ymm = _mm256_set1_epi64x(257ull); - shift_high_vec.ymm = _mm256_set1_epi64x(77ull); - prime_vec.ymm = _mm256_set1_epi64x(SZ_U64_MAX_PRIME); - golden_ratio_vec.ymm = _mm256_set1_epi64x(11400714819323198485ull); - prime_power_low_vec.ymm = _mm256_set1_epi64x(prime_power_low); - prime_power_high_vec.ymm = _mm256_set1_epi64x(prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u256_vec_t hash_low_vec, hash_high_vec, hash_mix_vec, chars_low_vec, chars_high_vec; - hash_low_vec.ymm = _mm256_setzero_si256(); - hash_high_vec.ymm = _mm256_setzero_si256(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. 
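// A scalar sketch of the per-window recurrence that the numbered steps above and the sliding loop
// below implement for each of the four lanes. The "modulo" is the same trick as in the vector
// code: the prime is assumed to sit close enough to 2^64 that a single conditional subtraction
// keeps the value in range. Illustrative names and constants, not the library's API.
#include <stddef.h>
#include <stdint.h>

enum { EXAMPLE_BASE = 31 };
#define EXAMPLE_MAX_PRIME 18446744073709551557ull /* plays the role of SZ_U64_MAX_PRIME, 2^64 - 59 */

static uint64_t push_char_example(uint64_t hash, uint8_t incoming) {
    hash = hash * EXAMPLE_BASE + incoming;               // steps 1 and 3: scale by the base, add the new byte
    if (hash > EXAMPLE_MAX_PRIME) hash -= EXAMPLE_MAX_PRIME; // step 4: cheap conditional "modulo"
    return hash;
}

static uint64_t slide_window_example(uint64_t hash, uint8_t outgoing, uint8_t incoming, uint64_t base_power) {
    hash -= outgoing * base_power;                       // step 0: drop the oldest byte, scaled by BASE^(window-1)
    hash = hash * EXAMPLE_BASE + incoming;               // steps 1 and 3 again
    if (hash > EXAMPLE_MAX_PRIME) hash -= EXAMPLE_MAX_PRIME; // step 4
    return hash;
}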
- hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - } - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t const step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - hash_low_vec.ymm = - _mm256_sub_epi64(hash_low_vec.ymm, _mm256_mul_epu64(chars_low_vec.ymm, prime_power_low_vec.ymm)); - hash_high_vec.ymm = - _mm256_sub_epi64(hash_high_vec.ymm, _mm256_mul_epu64(chars_high_vec.ymm, prime_power_high_vec.ymm)); - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. - hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - - // 5. 
Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - if ((cycle & step_mask) == 0) { - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - } - } -} - -#pragma clang attribute pop -#pragma GCC pop_options -#endif -#pragma endregion - -/* - * @brief AVX-512 implementation of the string search algorithms. - * - * Different subsets of AVX-512 were introduced in different years: - * - 2017 SkyLake: F, CD, ER, PF, VL, DQ, BW - * - 2018 CannonLake: IFMA, VBMI - * - 2019 IceLake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES - * - 2020 TigerLake: VP2INTERSECT - */ -#pragma region AVX512 Implementation - -#if SZ_USE_ICE -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) -#include - -/** - * @brief Helper structure to simplify work with 512-bit registers. - */ -typedef union sz_u512_vec_t { - __m512i zmm; - __m256i ymms[2]; - __m128i xmms[4]; - sz_u64_t u64s[8]; - sz_u32_t u32s[16]; - sz_u16_t u16s[32]; - sz_u8_t u8s[64]; - sz_i64_t i64s[8]; - sz_i32_t i32s[16]; -} sz_u512_vec_t; - -SZ_INTERNAL __mmask64 _sz_u64_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 64: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 64: - return _bzhi_u64(0xFFFFFFFFFFFFFFFF, n < 64 ? (sz_u32_t)n : 64); -} - -SZ_INTERNAL __mmask32 _sz_u32_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 32: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 32: - return _bzhi_u32(0xFFFFFFFF, n < 32 ? (sz_u32_t)n : 32); -} - -SZ_INTERNAL __mmask16 _sz_u16_clamp_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 16: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 16: - return _bzhi_u32(0xFFFFFFFF, n < 16 ? 
(sz_u32_t)n : 16); -} - -SZ_INTERNAL __mmask16 _sz_u16_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 16: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 16: - return (__mmask16)_bzhi_u32(0xFFFFFFFF, (sz_u32_t)n); -} - -SZ_INTERNAL __mmask32 _sz_u32_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 32: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 32: - return _bzhi_u32(0xFFFFFFFF, (sz_u32_t)n); -} - -SZ_INTERNAL __mmask64 _sz_u64_mask_until(sz_size_t n) { - // The simplest approach to compute this if we know that `n` is blow or equal 64: - // return (1ull << n) - 1; - // A slightly more complex approach, if we don't know that `n` is under 64: - return _bzhi_u64(0xFFFFFFFFFFFFFFFF, (sz_u32_t)n); -} - -SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - sz_u512_vec_t a_vec, b_vec; - - // Pointer arithmetic is cheap, fetching memory is not! - // So we can use the masked loads to fetch at most one cache-line for each string, - // compare the prefixes, and only then move forward. - sz_size_t a_head_length = 64 - ((sz_size_t)a % 64); // 63 or less. - sz_size_t b_head_length = 64 - ((sz_size_t)b % 64); // 63 or less. - a_head_length = a_head_length < a_length ? a_head_length : a_length; - b_head_length = b_head_length < b_length ? b_head_length : b_length; - sz_size_t head_length = a_head_length < b_head_length ? a_head_length : b_head_length; - __mmask64 head_mask = _sz_u64_mask_until(head_length); - a_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, b); - __mmask64 mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - else if (head_length == a_length && head_length == b_length) { return sz_equal_k; } - else { a += head_length, b += head_length, a_length -= head_length, b_length -= head_length; } - - // The rare case, when both string are very long. - __mmask64 a_mask, b_mask; - while ((a_length >= 64) & (b_length >= 64)) { - a_vec.zmm = _mm512_loadu_si512(a); - b_vec.zmm = _mm512_loadu_si512(b); - mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - a += 64, b += 64, a_length -= 64, b_length -= 64; - } - - // In most common scenarios at least one of the strings is under 64 bytes. - if (a_length | b_length) { - a_mask = _sz_u64_clamp_mask_until(a_length); - b_mask = _sz_u64_clamp_mask_until(b_length); - a_vec.zmm = _mm512_maskz_loadu_epi8(a_mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(b_mask, b); - // The AVX-512 `_mm512_mask_cmpneq_epi8_mask` intrinsics are generally handy in such environments. - // They, however, have latency 3 on most modern CPUs. Using AVX2: `_mm256_cmpeq_epi8` would have - // been cheaper, if we didn't have to apply `_mm256_movemask_epi8` afterwards. 
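// The mask helpers above all reduce to "a word with the lowest n bits set". Feeding such a mask
// into an AVX-512 zero-masking load reads only the first n bytes architecturally (masked-off
// elements cannot fault) and zeroes the rest of the register, which is how the kernels in this
// region handle short strings and unaligned tails without a scalar loop. A hedged scalar model:
#include <assert.h>
#include <stdint.h>

static uint64_t mask_until_model_example(uint64_t n) {
    assert(n <= 64);
    return n == 64 ? ~0ull : ((1ull << n) - 1); // _bzhi_u64 produces the same result without the n == 64 branch
}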
- mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask_not_equal != 0) { - sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); - char a_char = a_vec.u8s[first_diff]; - char b_char = b_vec.u8s[first_diff]; - return _sz_order_scalars(a_char, b_char); - } - // From logic perspective, the hardest cases are "abc\0" and "abc". - // The result must be `sz_greater_k`, as the latter is shorter. - else { return _sz_order_scalars(a_length, b_length); } - } - - return sz_equal_k; -} - -SZ_PUBLIC sz_bool_t sz_equal_skylake(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - __mmask64 mask; - sz_u512_vec_t a_vec, b_vec; - - while (length >= 64) { - a_vec.zmm = _mm512_loadu_si512(a); - b_vec.zmm = _mm512_loadu_si512(b); - mask = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); - if (mask != 0) return sz_false_k; - a += 64, b += 64, length -= 64; - } - - if (length) { - mask = _sz_u64_mask_until(length); - a_vec.zmm = _mm512_maskz_loadu_epi8(mask, a); - b_vec.zmm = _mm512_maskz_loadu_epi8(mask, b); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpneq_epi8_mask(mask, a_vec.zmm, b_vec.zmm); - return (sz_bool_t)(mask == 0); - } - - return sz_true_k; -} - -SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - __m512i value_vec = _mm512_set1_epi8(value); - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores". - // - // for (; length >= 64; target += 64, length -= 64) _mm512_storeu_si512(target, value_vec); - // _mm512_mask_storeu_epi8(target, _sz_u64_mask_until(length), value_vec); - // - // When the buffer is small, there isn't much to innovate. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, value_vec); - } - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - _mm512_mask_storeu_epi8(target, head_mask, value_vec); - for (target += head_length; body_length >= 64; target += 64, body_length -= 64) - _mm512_store_si512(target, value_vec); - _mm512_mask_storeu_epi8(target, tail_mask, value_vec); - } -} - -SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "stores" and "loads". - // - // for (; length >= 64; target += 64, source += 64, length -= 64) - // _mm512_storeu_si512(target, _mm512_loadu_si512(source)); - // __mmask64 mask = _sz_u64_mask_until(length); - // _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - // - // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, - // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. 
- // With two strings, we may consider the overall workload huge, if each exceeds 1 MB in length.
- int const is_huge = length >= 1ull * 1024ull * 1024ull;
-
- // When the buffer is small, there isn't much to innovate.
- if (length <= 64) {
- __mmask64 mask = _sz_u64_mask_until(length);
- _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source));
- }
- // When dealing with larger arrays, the optimization is not as simple as with the `sz_fill_avx512` function,
- // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer,
- // we can use aligned loads and stores, and the performance will be great.
- else if ((sz_size_t)target % 64 == 0 && (sz_size_t)source % 64 == 0 && !is_huge) {
- for (; length >= 64; target += 64, source += 64, length -= 64)
- _mm512_store_si512(target, _mm512_load_si512(source));
- // At this point the length is guaranteed to be under 64.
- __mmask64 mask = _sz_u64_mask_until(length);
- // Aligned loads and stores would work too, but it's not defined.
- _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source));
- }
- // The trickiest case is when both `source` and `target` are not aligned.
- // In this and simpler cases we can copy enough bytes into `target` to reach its cacheline boundary,
- // and then combine unaligned loads with aligned stores.
- else if (!is_huge) {
- sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less.
- sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less.
- sz_size_t body_length = length - head_length - tail_length; // Multiple of 64.
- __mmask64 head_mask = _sz_u64_mask_until(head_length);
- __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
- _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source));
- for (target += head_length, source += head_length; body_length >= 64;
- target += 64, source += 64, body_length -= 64)
- _mm512_store_si512(target, _mm512_loadu_si512(source)); // Unaligned load, but aligned store!
- _mm512_mask_storeu_epi8(target, tail_mask, _mm512_maskz_loadu_epi8(tail_mask, source));
- }
- // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use.
- //
- // 1. Moving in both directions to maximize the throughput, when fetching from multiple
- // memory pages. Also helps with cache set-associativity issues, as we won't always
- // be fetching the same entries in the lookup table.
- // 2. Using non-temporal stores to avoid polluting the cache.
- // 3. Prefetching the next cache line, to avoid stalling the CPU. This is generally useless
- // for predictable patterns, so disregard this advice.
- //
- // Bidirectional traversal adds about 10%, accelerating from 11 GB/s to 12 GB/s.
- // Using "streaming stores" boosts us from 12 GB/s to 19 GB/s.
- else {
- sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64;
- sz_size_t tail_length = (sz_size_t)(target + length) % 64;
- sz_size_t body_length = length - head_length - tail_length;
- __mmask64 head_mask = _sz_u64_mask_until(head_length);
- __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
- _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source));
- _mm512_mask_storeu_epi8(target + head_length + body_length, tail_mask,
- _mm512_maskz_loadu_epi8(tail_mask, source));
-
- // Now in the main loop, we can use non-temporal loads and stores,
- // performing the operation in both directions.
- for (target += head_length, source += head_length; // - body_length >= 128; // - target += 64, source += 64, body_length -= 128) { - _mm512_stream_si512((__m512i *)(target), _mm512_loadu_si512(source)); - _mm512_stream_si512((__m512i *)(target + body_length - 64), _mm512_loadu_si512(source + body_length - 64)); - } - if (body_length >= 64) _mm512_stream_si512((__m512i *)target, _mm512_loadu_si512(source)); - } -} - -SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - if (target == source) return; // Don't be silly, don't move the data if it's already there. - - // On very short buffers, that are one cache line in width or less, we don't need any loops. - // We can also avoid any data-dependencies between iterations, assuming we have 32 registers - // to pre-load the data, before writing it back. - if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); - } - else if (length <= 128) { - sz_size_t last_length = length - 64; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_maskz_loadu_epi8(mask, source + 64); - _mm512_storeu_epi8(target, source0); - _mm512_mask_storeu_epi8(target + 64, mask, source1); - } - else if (length <= 192) { - sz_size_t last_length = length - 128; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_loadu_epi8(source + 64); - __m512i source2 = _mm512_maskz_loadu_epi8(mask, source + 128); - _mm512_storeu_epi8(target, source0); - _mm512_storeu_epi8(target + 64, source1); - _mm512_mask_storeu_epi8(target + 128, mask, source2); - } - else if (length <= 256) { - sz_size_t last_length = length - 192; - __mmask64 mask = _sz_u64_mask_until(last_length); - __m512i source0 = _mm512_loadu_epi8(source); - __m512i source1 = _mm512_loadu_epi8(source + 64); - __m512i source2 = _mm512_loadu_epi8(source + 128); - __m512i source3 = _mm512_maskz_loadu_epi8(mask, source + 192); - _mm512_storeu_epi8(target, source0); - _mm512_storeu_epi8(target + 64, source1); - _mm512_storeu_epi8(target + 128, source2); - _mm512_mask_storeu_epi8(target + 192, mask, source3); - } - - // If the regions don't overlap at all, just use "copy" and save some brain cells thinking about corner cases. - else if (target + length < source || target >= source + length) { sz_copy_avx512(target, source, length); } - - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - else { - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - // The absolute most common case of using "moves" is shifting the data within a continuous buffer - // when adding a removing some values in it. In such cases, a typical shift is by 1, 2, 4, 8, 16, - // or 32 bytes, rarely larger. For small shifts, under the size of the ZMM register, we can use shuffles. 
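// Two ideas from the move kernel around this point, in scalar form: (1) for small buffers, loading
// everything into temporaries before storing anything makes overlap irrelevant; (2) for larger
// overlapping buffers, the copy direction must run away from the overlap - forward when the target
// sits below the source, backward otherwise. Illustrative sketch, not the library's API.
#include <stddef.h>
#include <string.h>

static void move_small_example(char *target, char const *source, size_t length) {
    char scratch[256];               // stand-in for the pre-loaded ZMM registers; assumes length <= 256
    memcpy(scratch, source, length); // read everything first...
    memcpy(target, scratch, length); // ...then write, so overlap cannot clobber unread bytes
}

static void move_any_example(char *target, char const *source, size_t length) {
    if (target < source) // shifting data left: traverse forward
        for (size_t i = 0; i != length; ++i) target[i] = source[i];
    else // shifting data right: traverse backward
        for (size_t i = length; i != 0; --i) target[i - 1] = source[i - 1];
}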
- // - // Remember: - // - if we are shifting data left, that we are traversing to the right. - // - if we are shifting data right, that we are traversing to the left. - int const left_to_right_traversal = source > target; - - // Now we guarantee, that the relative shift within registers is from 1 to 63 bytes and the output is aligned. - // Hopefully, we need to shift more than two ZMM registers, so we could consider `valignr` instruction. - // Sadly, using `_mm512_alignr_epi8` doesn't make sense, as it operates at a 128-bit granularity. - // - // - `_mm256_alignr_epi8` shifts entire 256-bit register, but we need many of them. - // - `_mm512_alignr_epi32` shifts 512-bit chunks, but only if the `shift` is a multiple of 4 bytes. - // - `_mm512_alignr_epi64` shifts 512-bit chunks by 8 bytes. - // - // All of those have a latency of 1 cycle, and the shift amount must be an immediate value! - // For 1-byte-shift granularity, the `_mm512_permutex2var_epi8` has a latency of 6 and needs VBMI! - // The most efficient and broadly compatible alternative could be to use a combination of align and shuffle. - // A similar approach was outlined in "Byte-wise alignr in AVX512F" by Wojciech Muła. - // http://0x80.pl/notesen/2016-10-16-avx512-byte-alignr.html - // - // That solution, is extremely mouthful, assuming we need compile time constants for the shift amount. - // A cleaner one, with a latency of 3 cycles, is to use `_mm512_permutexvar_epi8` or - // `_mm512_mask_permutexvar_epi8`, which can be seen as combination of a cross-register shuffle and blend, - // and is available with VBMI. That solution is still noticeably slower than AVX2. - // - // The GLibC implementation also uses non-temporal stores for larger buffers, we don't. - // https://codebrowser.dev/glibc/glibc/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S.html - if (left_to_right_traversal) { - // Head, body, and tail. - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - for (target += head_length, source += head_length; body_length >= 64; - target += 64, source += 64, body_length -= 64) - _mm512_store_si512(target, _mm512_loadu_si512(source)); - _mm512_mask_storeu_epi8(target, tail_mask, _mm512_maskz_loadu_epi8(tail_mask, source)); - } - else { - // Tail, body, and head. 
- _mm512_mask_storeu_epi8(target + head_length + body_length, tail_mask, - _mm512_maskz_loadu_epi8(tail_mask, source + head_length + body_length)); - for (; body_length >= 64; body_length -= 64) - _mm512_store_si512(target + head_length + body_length - 64, - _mm512_loadu_si512(source + head_length + body_length - 64)); - _mm512_mask_storeu_epi8(target, head_mask, _mm512_maskz_loadu_epi8(head_mask, source)); - } - } -} - -SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); - - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + sz_u64_ctz(mask); - h += 64, h_length -= 64; - } - - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + sz_u64_ctz(mask); - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_find_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_find_byte_avx512(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. - __mmask64 matches; - __mmask64 mask; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. - // We have several optimized versions of the lagorithm for shorter strings, - // but they all mimic the default case for unbounded length needles - if (n_length >= 64) { - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - if (sz_equal_skylake(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - - // TODO: If the last character contains a bad byte, we can reposition the start of the next iteration. - // This will be very helpful for very long needles. - } - } - // If there are only 2 or 3 characters in the needle, we don't even need the nested loop. 
- else if (n_length <= 3) { - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - if (matches) return h + sz_u64_ctz(matches); - } - } - // If the needle is smaller than the size of the ZMM register, we can use masked comparisons - // to avoid the the inner-most nested loop and compare the entire needle against a haystack - // slice in 3 CPU cycles. - else { - __mmask64 n_mask = _sz_u64_mask_until(n_length); - sz_u512_vec_t n_full_vec, h_full_vec; - n_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, n); - for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - h_full_vec.zmm = _mm512_maskz_loadu_epi8(n_mask, h + potential_offset); - if (_mm512_mask_cmpneq_epi8_mask(n_mask, h_full_vec.zmm, n_full_vec.zmm) == 0) - return h + potential_offset; - matches &= matches - 1; - } - } - } - - // The "tail" of the function uses masked loads to process the remaining bytes. - { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_ctz(matches); - if (n_length <= 3 || sz_equal_skylake(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - __mmask64 mask; - sz_u512_vec_t h_vec, n_vec; - n_vec.zmm = _mm512_set1_epi8(n[0]); - - while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_si512(h + h_length - 64); - mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); - if (mask) return h + h_length - 1 - sz_u64_clz(mask); - h_length -= 64; - } - - if (h_length) { - mask = _sz_u64_mask_until(h_length); - h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h); - // Reuse the same `mask` variable to find the bit that doesn't match - mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm); - if (mask) return h + 64 - sz_u64_clz(mask) - 1; - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. 
- if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_avx512(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Broadcast those characters into ZMM registers. - __mmask64 mask; - __mmask64 matches; - sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec; - n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]); - n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]); - n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]); - - // Scan through the string. - sz_cptr_t h_reversed; - for (; h_length >= n_length + 64; h_length -= 64) { - h_reversed = h + h_length - n_length - 64 + 1; - h_first_vec.zmm = _mm512_loadu_si512(h_reversed + offset_first); - h_mid_vec.zmm = _mm512_loadu_si512(h_reversed + offset_mid); - h_last_vec.zmm = _mm512_loadu_si512(h_reversed + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_skylake(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } - - // The "tail" of the function uses masked loads to process the remaining bytes. - { - mask = _sz_u64_mask_until(h_length - n_length + 1); - h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first); - h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid); - h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last); - matches = _kand_mask64(_kand_mask64( // Intersect the masks - _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), - _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), - _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm)); - while (matches) { - int potential_offset = sz_u64_clz(matches); - if (n_length <= 3 || sz_equal_skylake(h + 64 - potential_offset - 1, n, n_length)) - return h + 64 - potential_offset - 1; - sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 && - "The bit must be set before we squash it"); - matches &= ~((sz_u64_t)1 << (63 - potential_offset)); - } - } - - return SZ_NULL_CHAR; -} - -#pragma clang attribute pop -#pragma GCC pop_options - -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ - apply_to = function) - -/** - * @brief Computes the edit distance between two very short byte-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 63, and evaluates at most (63 * 2 + 1 = 127) diagonals, or just as many loop cycles. - * Supports an early exit, if the distance is bounded. - * Keeps all of the data and Levenshtein matrices skew diagonal in just a couple of registers. - * Benefits from the @b `vpermb` instructions, that can rotate the bytes across the entire ZMM register. 
- */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - - sz_size_t const max_length = 63u; - sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one."); - sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant."); - - // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register. - // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`. - sz_size_t const shorter_dim = shorter_length + 1; - sz_size_t const longer_dim = longer_length + 1; - - // The next few buffers will be swapped around. - sz_u512_vec_t previous_vec, current_vec, next_vec; - sz_u512_vec_t gaps_vec, substitutions_vec; - - // Load the strings into ZMM registers - just once. - sz_u512_vec_t longer_vec, shorter_vec, shorter_rotated_vec, rotate_left_vec, rotate_right_vec, ones_vec, bound_vec; - longer_vec.zmm = _mm512_maskz_loadu_epi8(_sz_u64_mask_until(longer_length), longer); - rotate_left_vec.zmm = _mm512_set_epi8( // - 0, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, // - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, // - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, // - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - rotate_right_vec.zmm = _mm512_set_epi8( // - 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, // - 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, // - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, // - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 63); - ones_vec.zmm = _mm512_set1_epi8(1); - bound_vec.zmm = _mm512_set1_epi8(bound <= 255 ? (sz_u8_t)bound : 255); - - // To simplify comparisons and traversals, we want to reverse the order of bytes in the shorter string. - for (sz_size_t i = 0; i != shorter_length; ++i) shorter_vec.u8s[63 - i] = shorter[i]; - shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_vec.zmm); - - // Let's say we are dealing with 3 and 5 letter words. - // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim). - // It will have: - // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3. - // - 2 diagonals of fixed length, at positions: 4, 5. - // - 3 diagonals of decreasing length, at positions: 6, 7, 8. - sz_size_t const diagonals_count = shorter_dim + longer_dim - 1; - - // Initialize the first two diagonals: - // - // previous_vec.u8s[0] = 0; - // current_vec.u8s[0] = current_vec.u8s[1] = 1; - // - // We can do a similar thing with vector ops: - previous_vec.zmm = _mm512_setzero_si512(); - current_vec.zmm = _mm512_set1_epi8(1); - - // We skip diagonals 0 and 1, as they are trivial. - // We will start with diagonal 2, which has length 3, with the first and last elements being preset, - // so we are effectively computing just one value, as will be marked by a single set bit in - // the `next_diagonal_mask` on the very first iteration. - sz_size_t next_diagonal_index = 2; - __mmask64 next_diagonal_mask = 0; - - // Progress through the upper triangle of the Levenshtein matrix. 
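    // As a scalar sketch, each of the vectorized loops below evaluates the same per-cell recurrence along
    // one skewed diagonal (`previous`, `current`, and `next` are illustrative names; the exact `k`/`k - 1`
    // offsets differ between the three triangles, which is what the byte rotations encode):
    //
    //      next[k] = sz_min_of_two(
    //          sz_min_of_two(current[k], current[k - 1]) + 1,            // insertion or deletion
    //          previous[k - 1] + (shorter_char == longer_char ? 0 : 1)); // substitution
    //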
- for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) { - // After this iteration, the values at offset `0` and `next_diagonal_index` in the `next_vec` - // should be set to `next_diagonal_index`, but it's easier to broadcast the value to the whole vector, - // and later merge with a mask with new values. - next_vec.zmm = _mm512_set1_epi8((sz_u8_t)next_diagonal_index); - - // The mask also adds one set bit. - next_diagonal_mask = _kor_mask64(next_diagonal_mask, 1); - next_diagonal_mask = _kshiftli_mask64(next_diagonal_mask, 1); - - // Check for equality between string slices. - __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - substitutions_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, substitutions_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(_mm512_permutexvar_epi8(rotate_right_vec.zmm, current_vec.zmm), current_vec.zmm), - ones_vec.zmm); - next_vec.zmm = _mm512_mask_min_epu8(next_vec.zmm, next_diagonal_mask, gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = current_vec.zmm; - current_vec.zmm = next_vec.zmm; - - // Shift the shorter string - shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_rotated_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. - __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - } - - // Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles. - for (; next_diagonal_index != longer_dim; ++next_diagonal_index) { - // After this iteration, the value `shorted_dim - 1` in the `next_vec` - // should be set to `next_diagonal_index`, but it's easier to broadcast the value to the whole vector, - // and later merge with a mask with new values. - next_vec.zmm = _mm512_set1_epi8((sz_u8_t)next_diagonal_index); - - // Make sure we update the first entry. - next_diagonal_mask = _kor_mask64(next_diagonal_mask, 1); - - // Check for equality between string slices. - __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(current_vec.zmm, _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm)), - ones_vec.zmm); - next_vec.zmm = _mm512_mask_min_epu8(next_vec.zmm, next_diagonal_mask, gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm); - current_vec.zmm = next_vec.zmm; - - // Let's shift the longer string now. - longer_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, longer_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. 
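        // In scalar form, the early-exit test below is roughly the following sketch, with illustrative
        // names for the populated cells of the current diagonal:
        //
        //      int any_within_bound = 0;
        //      for (sz_size_t k = 0; k != diagonal_length; ++k) any_within_bound |= next[k] <= bound;
        //      if (!any_within_bound) return SZ_SIZE_MAX;
        //
        // `_ktestz_mask64_u8` returns 1 exactly when the bitwise AND of its two mask arguments is zero,
        // i.e. when no populated cell is still within the bound.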
- __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - } - - // Now let's handle the bottom right triangle. - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - - // Check for equality between string slices. - __mmask64 conflict_mask = _mm512_cmpneq_epi8_mask(longer_vec.zmm, shorter_rotated_vec.zmm); - substitutions_vec.zmm = _mm512_mask_add_epi8(previous_vec.zmm, conflict_mask, previous_vec.zmm, ones_vec.zmm); - gaps_vec.zmm = _mm512_add_epi8( - // Insertions or deletions - _mm512_min_epu8(current_vec.zmm, _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm)), - ones_vec.zmm); - next_vec.zmm = _mm512_min_epu8(gaps_vec.zmm, substitutions_vec.zmm); - - // Mark the current skewed diagonal as the previous one and the next one as the current one. - previous_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, current_vec.zmm); - current_vec.zmm = next_vec.zmm; - - // Let's shift the longer string now. - longer_vec.zmm = _mm512_permutexvar_epi8(rotate_left_vec.zmm, longer_vec.zmm); - - // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. - __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) { // - return SZ_SIZE_MAX; - } - // In every following iterations we take use a shorter prefix of each register, - // but we don't need to update the `next_diagonal_mask` anymore... except for the early exit. - next_diagonal_mask = _kshiftri_mask64(next_diagonal_mask, 1); - } - return current_vec.u8s[0]; -} - -/** - * @brief Computes the edit distance between two somewhat short bytes-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 127, and evaluates at most (127 * 2 + 1 = 255) diagonals. - * Supports an early exit, if the distance is bounded. - * Uses a lot more CPU registers space, than the `upto63` variant. - * Benefits from the @b `vpermi2b` instructions, that can rotate the bytes in 2 registers at once. - * - * This may be one of the most freuqently called kernels for: - * - source code analysis, assuming most lines are either under 80 or under 120 characters long. - * - DNA sequence alignment, as most short reads are 50-300 characters long. - */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto127_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound) { - sz_unused(shorter && shorter_length && longer && longer_length && bound); - return 0; -} - -/** - * @brief Computes the edit distance between two longer bytes-strings using the AVX-512VBMI extensions. - * - * Applies to string lengths up to 255, and evaluates at most (255 * 2 + 1 = 511) diagonals. - * Supports an early exit, if the distance is bounded. - * Uses a lot more CPU registers space, than the `upto63` variant. - * - * Each of 2x string ends up occupying 4 ZMM registers, and each of 3x diagonals uses 4 ZMM registers. - * So 20x of the 32x are persistently occupied, and the rest are used for math temporarily. - * This is the largest space-efficient variant, as strings beyond 255 characters may require - * 16-bit accumulators, which would be a significant bottleneck. 
- */
-SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto_avx512( //
-    sz_cptr_t shorter, sz_size_t shorter_length,                      //
-    sz_cptr_t longer, sz_size_t longer_length,                        //
-    sz_size_t bound) {
-    sz_unused(shorter && shorter_length && longer && longer_length && bound);
-    return 0;
-}
-
-/**
- * @brief Computes the edit distance between two longer byte-strings using the AVX-512VBMI extensions,
- *        assuming the upper distance bound cannot exceed 255, but the string length can be arbitrary.
- *
- * Applies to string lengths up to 255, and evaluates at most (255 * 2 + 1 = 511) diagonals.
- * Supports an early exit, if the distance is bounded.
- * Uses a lot more CPU register space than the `upto63` variant.
- *
- * Each of the 2x strings ends up occupying 4 ZMM registers, and each of 3x diagonals uses 4 ZMM registers.
- * So 20x of the 32x are persistently occupied, and the rest are used for math temporarily.
- * This is the largest space-efficient variant, as strings beyond 255 characters may require
- * 16-bit accumulators, which would be a significant bottleneck.
- */
-SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto255bound_avx512( //
-    sz_cptr_t shorter, sz_size_t shorter_length,                              //
-    sz_cptr_t longer, sz_size_t longer_length,                                //
-    sz_size_t bound) {
-    sz_unused(shorter && shorter_length && longer && longer_length && bound);
-    return 0;
-}
-
-/**
- * @brief Computes the edit distance between two mid-length UTF-8 strings using the AVX-512VBMI extensions.
- *
- * Applies to string lengths up to 127, and evaluates at most (127 * 2 + 1 = 255) diagonals.
- * Supports an early exit, if the distance is bounded.
- * Benefits from the @b `valignd` instructions used to rotate UTF-32 unpacked Unicode codepoints.
- *
- * Each string is unpacked into 128 characters * 4 bytes per character / 64 bytes per register = 8 registers.
- *
- */
-SZ_INTERNAL sz_size_t _sz_edit_distance_utf8_skewed_diagonals_upto127_avx512( //
-    sz_cptr_t shorter, sz_size_t shorter_length,                              //
-    sz_cptr_t longer, sz_size_t longer_length,                                //
-    sz_size_t bound) {
-    sz_unused(shorter && shorter_length && longer && longer_length && bound);
-    return 0;
-}
-
-SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( //
-    sz_cptr_t shorter, sz_size_t shorter_length,                        //
-    sz_cptr_t longer, sz_size_t longer_length,                          //
-    sz_size_t bound, sz_memory_allocator_t *alloc) {
-
-    sz_unused(shorter && longer && bound && alloc);
-
-    // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
-    sz_memory_allocator_t global_alloc;
-    if (!alloc) {
-        sz_memory_allocator_init_default(&global_alloc);
-        alloc = &global_alloc;
-    }
-
-    // TODO: Generalize!
-    sz_size_t const max_length = 256u * 256u;
-    sz_assert(shorter_length <= longer_length && "The 'shorter' string is longer than the 'longer' one.");
-    sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant.");
-    sz_unused(longer_length && bound && max_length);
-
-#if 0
-    // We are going to store 3 diagonals of the matrix.
-    // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`.
-    sz_size_t const shorter_dim = shorter_length + 1;
-    sz_size_t const longer_dim = longer_length + 1;
-    // Unlike the serial version, we also want to avoid reverse-order iteration over the shorter string.
-    // So let's allocate a bit more memory and reverse-export our shorter string into that buffer.
- sz_size_t const buffer_length = sizeof(sz_u16_t) * longer_dim * 3 + shorter_length; - sz_u16_t *const distances = (sz_u16_t *)alloc->allocate(buffer_length, alloc->handle); - if (!distances) return SZ_SIZE_MAX; - - // The next few pointers will be swapped around. - sz_u16_t *previous_distances = distances; - sz_u16_t *current_distances = previous_distances + longer_dim; - sz_u16_t *next_distances = current_distances + longer_dim; - sz_ptr_t const shorter_reversed = (sz_ptr_t)(next_distances + longer_dim); - - // Export the reversed string into the buffer. - for (sz_size_t i = 0; i != shorter_length; ++i) shorter_reversed[i] = shorter[shorter_length - 1 - i]; - - // Initialize the first two diagonals: - previous_distances[0] = 0; - current_distances[0] = current_distances[1] = 1; - - // Using ZMM registers, we can process 32x 16-bit values at once, - // storing 16 bytes of each string in YMM registers. - sz_u512_vec_t insertions_vec, deletions_vec, substitutions_vec, next_vec; - sz_u512_vec_t ones_u16_vec; - ones_u16_vec.zmm = _mm512_set1_epi16(1); - - // This is a mixed-precision implementation, using 8-bit representations for part of the operations. - // Even there, in case `SZ_USE_HASWELL=0`, let's use the `sz_u512_vec_t` type, addressing the first YMM halfs. - sz_u512_vec_t shorter_vec, longer_vec; - sz_u512_vec_t ones_u8_vec; - ones_u8_vec.ymms[0] = _mm256_set1_epi8(1); - - // Let's say we are dealing with 3 and 5 letter words. - // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim). - // It will have: - // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3. - // - 2 diagonals of fixed length, at positions: 4, 5. - // - 3 diagonals of decreasing length, at positions: 6, 7, 8. - sz_size_t const diagonals_count = shorter_dim + longer_dim - 1; - - // Progress through the upper triangle of the Levenshtein matrix. - sz_size_t next_diagonal_index = 2; - for (; next_diagonal_index != shorter_dim; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = next_diagonal_index + 1; - for (sz_size_t offset_within_diagonal = 0; offset_within_diagonal + 2 < next_diagonal_length;) { - sz_u32_t remaining_length = (sz_u32_t)(next_diagonal_length - offset_within_diagonal - 2); - sz_u32_t register_length = remaining_length < 32 ? remaining_length : 32; - sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length); - longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + offset_within_diagonal); - // Our original code addressed the shorter string `[next_diagonal_index - offset_within_diagonal - 2]` - // for growing `offset_within_diagonal`. If the `shorter` string was reversed, the - // `[next_diagonal_index - offset_within_diagonal - 2]` would be equal to `[shorter_length - 1 - - // next_diagonal_index + offset_within_diagonal + 2]`. Which simplified would be equal to - // `[shorter_length - next_diagonal_index + offset_within_diagonal + 1]`. - shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8( // - remaining_length_mask, - shorter_reversed + shorter_length - next_diagonal_index + offset_within_diagonal + 1); - // For substitutions, perform the equality comparison using AVX2 instead of AVX-512 - // to get the result as a vector, instead of a bitmask. Adding 1 to every scalar we can overflow - // transforming from {0xFF, 0} values to {0, 1} values - exactly what we need. Then - upcast to 16-bit. 
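            // A worked example of that byte trick: `_mm256_cmpeq_epi8` yields 0xFF for equal bytes and
            // 0x00 for different ones, so after adding 1 each lane holds exactly the substitution cost:
            //
            //      equal:     0xFF + 1 == 0x00   // no penalty, the addition wraps around
            //      different: 0x00 + 1 == 0x01   // one substitution
            //
            // which is then widened to 16 bits and added to `previous_distances` below.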
- substitutions_vec.zmm = _mm512_cvtepi8_epi16( // - _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0])); - substitutions_vec.zmm = _mm512_add_epi16( // - substitutions_vec.zmm, - _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + offset_within_diagonal)); - // For insertions and deletions, on modern hardware, it's faster to issue two separate loads, - // than rotate the bytes in the ZMM register. - insertions_vec.zmm = - _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + offset_within_diagonal); - deletions_vec.zmm = - _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + offset_within_diagonal + 1); - // First get the minimum of insertions and deletions. - next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm); - next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm); - _mm512_mask_storeu_epi16(next_distances + offset_within_diagonal + 1, remaining_length_mask, next_vec.zmm); - offset_within_diagonal += register_length; - } - // Don't forget to populate the first row and the first column of the Levenshtein matrix. - next_distances[0] = next_distances[next_diagonal_length - 1] = (sz_u16_t)next_diagonal_index; - // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory. - sz_u16_t *temporary = previous_distances; - previous_distances = current_distances; - current_distances = next_distances; - next_distances = temporary; - } - - // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a - // larger diagonal. From now onwards, we will be shrinking. Instead of adding value equal to the skewed diagonal - // index on either side, we will be cropping those values out. - for (; next_diagonal_index != diagonals_count; ++next_diagonal_index) { - sz_size_t const next_diagonal_length = diagonals_count - next_diagonal_index; - for (sz_size_t i = 0; i != next_diagonal_length;) { - sz_u32_t remaining_length = (sz_u32_t)(next_diagonal_length - i); - sz_u32_t register_length = remaining_length < 32 ? remaining_length : 32; - sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length); - longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + next_diagonal_index - n + i); - // Our original code addressed the shorter string `[shorter_length - 1 - i]` for growing `i`. - // If the `shorter` string was reversed, the `[shorter_length - 1 - i]` would - // be equal to `[shorter_length - 1 - shorter_length + 1 + i]`. - // Which simplified would be equal to just `[i]`. Beautiful! - shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, shorter_reversed + i); - // For substitutions, perform the equality comparison using AVX2 instead of AVX-512 - // to get the result as a vector, instead of a bitmask. The compare it against the accumulated - // substitution costs. - substitutions_vec.zmm = _mm512_cvtepi8_epi16( // - _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0])); - substitutions_vec.zmm = _mm512_add_epi16( // - substitutions_vec.zmm, _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + i)); - // For insertions and deletions, on modern hardware, it's faster to issue two separate loads, - // than rotate the bytes in the ZMM register. 
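            // The two loads below fetch overlapping slices of the same diagonal: the slice at offset `i`
            // supplies one neighbor of every cell and the slice at `i + 1` the other, so no cross-lane
            // rotation of an already-loaded register is required. As a scalar sketch (illustrative names):
            //
            //      insertion_cost[k] = current_distances[i + k];
            //      deletion_cost[k] = current_distances[i + k + 1];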
- insertions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i); - deletions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i + 1); - // First get the minimum of insertions and deletions. - next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm); - next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm); - _mm512_mask_storeu_epi16(next_distances + i, remaining_length_mask, next_vec.zmm); - i += register_length; - } - - // Perform a circular rotation (three-way swap) of those buffers, to reuse the memory, this time, with a shift, - // dropping the first element in the current array. - sz_u16_t *temporary = previous_distances; - previous_distances = current_distances + 1; - current_distances = next_distances; - next_distances = temporary; - } - - // Cache scalar before `free` call. - sz_size_t result = current_distances[0]; - alloc->free(distances, buffer_length, alloc->handle); - return result; -#endif - return 0; -} - -SZ_INTERNAL sz_size_t sz_edit_distance_avx512( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - - // Bounded computations may exit early. - int const is_bounded = bound < longer_length; - if (is_bounded) { - // If one of the strings is empty - the edit distance is equal to the length of the other one. - if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); - // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; - } - - // Make sure the shorter string is actually shorter. - if (shorter_length > longer_length) { - sz_cptr_t temporary = shorter; - shorter = longer; - longer = temporary; - sz_size_t temporary_length = shorter_length; - shorter_length = longer_length; - longer_length = temporary_length; - } - - // Dispatch the right implementation based on the length of the strings. - if (longer_length < 64u) - return _sz_edit_distance_skewed_diagonals_upto63_avx512( // - shorter, shorter_length, longer, longer_length, bound); - // else if (longer_length < 256u * 256u) - // return _sz_edit_distance_skewed_diagonals_upto65k_avx512( // - // shorter, shorter_length, longer, longer_length, bound, alloc); - else - return sz_edit_distance_serial(shorter, shorter_length, longer, longer_length, bound, alloc); -} - -SZ_PUBLIC sz_u64_t sz_checksum_avx512(sz_cptr_t text, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "loads". - // - // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, - // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. - // With two strings, we may consider the overal workload huge, if each exceeds 1 MB in length. - int const is_huge = length >= 1ull * 1024ull * 1024ull; - sz_u512_vec_t text_vec, sums_vec; - - // When the buffer is small, there isn't much to innovate. 
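    // All of the branches below compute the same scalar quantity, sketched here for reference:
    //
    //      sz_u64_t checksum = 0;
    //      for (sz_size_t i = 0; i != length; ++i) checksum += (sz_u8_t)text[i];
    //      return checksum;
    //
    // The vectorized paths lean on `_mm*_sad_epu8(x, zero)`, which sums every group of 8 bytes against
    // zero into one 64-bit lane, so a single instruction folds 16, 32, or 64 bytes into a handful of
    // partial sums that are reduced at the very end.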
- if (length <= 16) { - __mmask16 mask = _sz_u16_mask_until(length); - text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text); - sums_vec.xmms[0] = _mm_sad_epu8(text_vec.xmms[0], _mm_setzero_si128()); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_vec.xmms[0]); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_vec.xmms[0], 1); - return low + high; - } - else if (length <= 32) { - __mmask32 mask = _sz_u32_mask_until(length); - text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text); - sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256()); - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - return low + high; - } - else if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - text_vec.zmm = _mm512_maskz_loadu_epi8(mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - return _mm512_reduce_add_epi64(sums_vec.zmm); - } - else if (!is_huge) { - sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(text + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - for (text += head_length; body_length >= 64; text += 64, body_length -= 64) { - text_vec.zmm = _mm512_load_si512((__m512i const *)text); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - } - text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - return _mm512_reduce_add_epi64(sums_vec.zmm); - } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // - // 1. Moving in both directions to maximize the throughput, when fetching from multiple - // memory pages. Also helps with cache set-associativity issues, as we won't always - // be fetching the same entries in the lookup table. - // 2. Using non-temporal stores to avoid polluting the cache. - // 3. Prefetching the next cache line, to avoid stalling the CPU. This generally useless - // for predictable patterns, so disregard this advice. - // - // Bidirectional traversal generally adds about 10% to such algorithms. 
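    // The shape of that bidirectional pass, ignoring vectorization, is roughly the following sketch,
    // where `body` stands for the aligned middle of the buffer and `sum_of_64_bytes` is a stand-in for
    // the `_mm512_sad_epu8` reduction used above:
    //
    //      sz_u64_t forward = 0, backward = 0;
    //      for (sz_size_t i = 0, j = body_length; i + 128 <= j; i += 64, j -= 64) {
    //          forward += sum_of_64_bytes(body + i);        // streaming load near the front
    //          backward += sum_of_64_bytes(body + j - 64);  // streaming load near the back
    //      }
    //      // ... plus at most one leftover 64-byte block and the masked head/tail sums.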
-    else {
-        sz_u512_vec_t text_reversed_vec, sums_reversed_vec;
-        sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64;
-        sz_size_t tail_length = (sz_size_t)(text + length) % 64;
-        sz_size_t body_length = length - head_length - tail_length;
-        __mmask64 head_mask = _sz_u64_mask_until(head_length);
-        __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
-
-        text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
-        sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
-        text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length);
-        sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512());
-
-        // Now in the main loop, we can use non-temporal loads and stores,
-        // performing the operation in both directions.
-        for (text += head_length; body_length >= 128; text += 64, body_length -= 128) {
-            text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
-            sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
-            text_reversed_vec.zmm = _mm512_stream_load_si512((__m512i *)(text + body_length - 64));
-            sums_reversed_vec.zmm =
-                _mm512_add_epi64(sums_reversed_vec.zmm, _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()));
-        }
-        if (body_length >= 64) {
-            text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
-            sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
-        }
-
-        return _mm512_reduce_add_epi64(_mm512_add_epi64(sums_vec.zmm, sums_reversed_vec.zmm));
-    }
-}
-
-SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, //
-                                sz_hash_callback_t callback, void *callback_handle) {
-
-    if (length < window_length || !window_length) return;
-    if (length < 4 * window_length) {
-        sz_hashes_serial(start, length, window_length, step, callback, callback_handle);
-        return;
-    }
-
-    // Using AVX-512, we can perform 8 long integer multiplications and additions within one ZMM register.
-    // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel.
-    sz_size_t const max_hashes = length - window_length + 1;
-    sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads.
-    sz_u8_t const *text_first = (sz_u8_t const *)start;
-    sz_u8_t const *text_second = text_first + min_hashes_per_thread;
-    sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2;
-    sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3;
-    sz_u8_t const *text_end = text_first + length;
-
-    // Broadcast the global constants into the registers.
-    // Both high and low hashes will work with the same prime and golden ratio.
-    sz_u512_vec_t prime_vec, golden_ratio_vec;
-    prime_vec.zmm = _mm512_set1_epi64(SZ_U64_MAX_PRIME);
-    golden_ratio_vec.zmm = _mm512_set1_epi64(11400714819323198485ull);
-
-    // Prepare the `base ^ (window_length - 1) % prime` values, that we are going to use to subtract
-    // the outgoing characters during the modulo arithmetic.
-    sz_u64_t prime_power_low = 1, prime_power_high = 1;
-    for (sz_size_t i = 0; i + 1 < window_length; ++i)
-        prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME,
-        prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME;
-
-    // We will be evaluating 4 offsets at a time with 2 different hash functions.
-    // We can fit all those 8 state variables in each of the following ZMM registers.
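    // Per lane, the rolling hashes below follow the textbook Rabin-Karp recurrences, sketched here with
    // `base` standing for 31 or 257 and `prime` for `SZ_U64_MAX_PRIME`:
    //
    //      // appending the incoming byte:
    //      hash = (hash * base + incoming) % prime;
    //      // dropping the outgoing byte once the window is full:
    //      hash = (hash + prime - (outgoing * prime_power) % prime) % prime;
    //
    // where `prime_power == base ^ (window_length - 1) % prime`. The vectorized code below replaces the
    // full `%` with the single conditional subtraction described in step 4.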
- sz_u512_vec_t base_vec, prime_power_vec, shift_vec; - base_vec.zmm = _mm512_set_epi64(31ull, 31ull, 31ull, 31ull, 257ull, 257ull, 257ull, 257ull); - shift_vec.zmm = _mm512_set_epi64(0ull, 0ull, 0ull, 0ull, 77ull, 77ull, 77ull, 77ull); - prime_power_vec.zmm = _mm512_set_epi64(prime_power_low, prime_power_low, prime_power_low, prime_power_low, - prime_power_high, prime_power_high, prime_power_high, prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u512_vec_t hash_vec, chars_vec; - hash_vec.zmm = _mm512_setzero_si512(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`... - chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - - // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); - } - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - sz_u512_vec_t hash_mix_vec; - hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_extracti64x4_epi64(hash_mix_vec.zmm, 0)); - - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. - chars_vec.zmm = _mm512_set_epi64(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length], // - text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - hash_vec.zmm = _mm512_sub_epi64(hash_vec.zmm, _mm512_mullo_epi64(chars_vec.zmm, prime_power_vec.zmm)); - - // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. 
- chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - - // ... and prefetch the next four characters into Level 2 or higher. - _mm_prefetch((sz_cptr_t)text_fourth + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_third + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_second + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_first + 1, _MM_HINT_T1); - - // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_castsi512_si256(hash_mix_vec.zmm)); - - if ((cycle & step_mask) == 0) { - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - } - } -} - -#pragma clang attribute pop -#pragma GCC pop_options - -#pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512vbmi", "avx512vbmi2", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512vbmi,avx512vbmi2,bmi,bmi2"))), \ - apply_to = function) - -SZ_PUBLIC void sz_look_up_transform_ice(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - // But if at least 3 cache lines are touched, the AVX-512 implementation should be faster. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } - - // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail, - // and may include more cache-lines in-between. Knowing this, we can avoid expensive unaligned stores - // by computing 2 masks - for the head and tail, using masked stores for the head and tail, and unmasked - // for the body. - sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - // We need to pull the lookup table into 4x ZMM registers. - // We can use `vpermi2b` instruction to perform the look in two ZMM registers with `_mm512_permutex2var_epi8` - // intrinsics, but it has a 6-cycle latency on Sapphire Rapids and requires AVX512-VBMI. 
Assuming we need to - // operate on 4 registers, it might be cleaner to use 2x separate `_mm512_permutexvar_epi8` calls. - // Combining the results with 2x `_mm512_test_epi8_mask` and 3x blends afterwards. - // - // - 4x `_mm512_permutexvar_epi8` maps to "VPERMB (ZMM, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p5 - // - On Genoa: 6 cycles latency, ports: 1*FP12 - // - 3x `_mm512_mask_blend_epi8` maps to "VPBLENDMB_Z (ZMM, K, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p05 - // - On Genoa: 1 cycle latency, ports: 1*FP0123 - // - 2x `_mm512_test_epi8_mask` maps to "VPTESTMB (K, ZMM, ZMM)": - // - On Ice Lake: 3 cycles latency, ports: 1*p5 - // - On Genoa: 4 cycles latency, ports: 1*FP01 - // - sz_u512_vec_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec; - lut_0_to_63_vec.zmm = _mm512_loadu_si512((lut)); - lut_64_to_127_vec.zmm = _mm512_loadu_si512((lut + 64)); - lut_128_to_191_vec.zmm = _mm512_loadu_si512((lut + 128)); - lut_192_to_255_vec.zmm = _mm512_loadu_si512((lut + 192)); - - sz_u512_vec_t first_bit_vec, second_bit_vec; - first_bit_vec.zmm = _mm512_set1_epi8((char)0x80); - second_bit_vec.zmm = _mm512_set1_epi8((char)0x40); - - __mmask64 first_bit_mask, second_bit_mask; - sz_u512_vec_t source_vec; - // If the top bit is set in each word of `source_vec`, than we use `lookup_128_to_191_vec` or - // `lookup_192_to_255_vec`. If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`. - sz_u512_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec; - sz_u512_vec_t blended_0_to_127_vec, blended_128_to_255_vec, blended_0_to_255_vec; - - // Handling the head. - if (head_length) { - source_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_mask_storeu_epi8(target, head_mask, blended_0_to_255_vec.zmm); - source += head_length, target += head_length, length -= head_length; - } - - // Handling the body in 64-byte chunks aligned to cache-line boundaries with respect to `target`. 
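    // The scalar equivalent of each iteration below is a plain table lookup: the top two bits of a byte
    // select one of the four 64-entry quarters, and the low six bits index inside it (a sketch):
    //
    //      sz_u8_t c = (sz_u8_t)source[i];
    //      sz_u8_t quarter = c >> 6; // 0 -> lut_0_to_63, 1 -> lut_64_to_127, 2 -> lut_128_to_191, 3 -> lut_192_to_255
    //      target[i] = lut[(sz_size_t)quarter * 64 + (c & 63)]; // same as lut[c]
    //
    // Since `_mm512_permutexvar_epi8` only honors the low 6 bits of every index byte, all four lookups can
    // be issued unconditionally, and the two `_mm512_test_epi8_mask` bits pick the right one per byte.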
- while (length >= 64) { - source_vec.zmm = _mm512_loadu_si512(source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_store_si512(target, blended_0_to_255_vec.zmm); //! Aligned store, our main weapon! - source += 64, target += 64, length -= 64; - } - - // Handling the tail. - if (tail_length) { - source_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, source); - lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm); - lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm); - lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm); - lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm); - first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm); - second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm); - blended_0_to_127_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm); - blended_128_to_255_vec.zmm = - _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm); - blended_0_to_255_vec.zmm = - _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm); - _mm512_mask_storeu_epi8(target, tail_mask, blended_0_to_255_vec.zmm); - source += tail_length, target += tail_length, length -= tail_length; - } -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - - // Before initializing the AVX-512 vectors, we may want to run the sequential code for the first few bytes. - // In practice, that only hurts, even when we have matches every 5-ish bytes. - // - // if (length < SZ_SWAR_THRESHOLD) return sz_find_charset_serial(text, length, filter); - // sz_cptr_t early_result = sz_find_charset_serial(text, SZ_SWAR_THRESHOLD, filter); - // if (early_result) return early_result; - // text += SZ_SWAR_THRESHOLD; - // length -= SZ_SWAR_THRESHOLD; - // - // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. - // That way when we invoke `_mm512_shuffle_epi8` we can use the same mask for both lanes. - sz_u512_vec_t filter_even_vec, filter_odd_vec; - __m256i filter_ymm = _mm256_lddqu_si256((__m256i const *)filter); - // There are a few way to initialize filters without having native strided loads. 
-    // In the chronological order of experiments:
-    // - serial code initializing 128 bytes of odd and even mask
-    // - using several shuffles
-    // - using `_mm512_permutexvar_epi8`
-    // - using `_mm512_broadcast_i32x4(_mm256_castsi256_si128(_mm256_maskz_compress_epi8(0x55555555, filter_ymm)))`
-    //   and `_mm512_broadcast_i32x4(_mm256_castsi256_si128(_mm256_maskz_compress_epi8(0xaaaaaaaa, filter_ymm)))`
-    filter_even_vec.zmm = _mm512_broadcast_i32x4(_mm256_castsi256_si128( // broadcast __m128i to __m512i
-        _mm256_maskz_compress_epi8(0x55555555, filter_ymm)));
-    filter_odd_vec.zmm = _mm512_broadcast_i32x4(_mm256_castsi256_si128( // broadcast __m128i to __m512i
-        _mm256_maskz_compress_epi8(0xaaaaaaaa, filter_ymm)));
-    // After the unzipping operation, we can validate the contents of the vectors like this:
-    //
-    //      for (sz_size_t i = 0; i != 16; ++i) {
-    //          sz_assert(filter_even_vec.u8s[i] == filter->_u8s[i * 2]);
-    //          sz_assert(filter_odd_vec.u8s[i] == filter->_u8s[i * 2 + 1]);
-    //          sz_assert(filter_even_vec.u8s[i + 16] == filter->_u8s[i * 2]);
-    //          sz_assert(filter_odd_vec.u8s[i + 16] == filter->_u8s[i * 2 + 1]);
-    //          sz_assert(filter_even_vec.u8s[i + 32] == filter->_u8s[i * 2]);
-    //          sz_assert(filter_odd_vec.u8s[i + 32] == filter->_u8s[i * 2 + 1]);
-    //          sz_assert(filter_even_vec.u8s[i + 48] == filter->_u8s[i * 2]);
-    //          sz_assert(filter_odd_vec.u8s[i + 48] == filter->_u8s[i * 2 + 1]);
-    //      }
-    //
-    sz_u512_vec_t text_vec;
-    sz_u512_vec_t lower_nibbles_vec, higher_nibbles_vec;
-    sz_u512_vec_t bitset_even_vec, bitset_odd_vec;
-    sz_u512_vec_t bitmask_vec, bitmask_lookup_vec;
-    bitmask_lookup_vec.zmm = _mm512_set_epi8( //
-        -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, //
-        -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, //
-        -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, //
-        -128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1);
-
-    while (length) {
-        // The following algorithm is a transposed equivalent of the "SIMDized check which bytes are in a set"
-        // solutions by Wojciech Muła. We populate the bitmask differently and target newer CPUs, so
-        // StringZilla uses a somewhat different approach.
-        // http://0x80.pl/articles/simd-byte-lookup.html#alternative-implementation-new
-        //
-        //      sz_u8_t input = *(sz_u8_t const *)text;
-        //      sz_u8_t lo_nibble = input & 0x0f;
-        //      sz_u8_t hi_nibble = input >> 4;
-        //      sz_u8_t bitset_even = filter_even_vec.u8s[hi_nibble];
-        //      sz_u8_t bitset_odd = filter_odd_vec.u8s[hi_nibble];
-        //      sz_u8_t bitmask = (1 << (lo_nibble & 0x7));
-        //      sz_u8_t bitset = lo_nibble < 8 ? bitset_even : bitset_odd;
-        //      if ((bitset & bitmask) != 0) return text;
-        //      else { length--, text++; }
-        //
-        // The nice part about this is that loading the strided data is very easy with Arm NEON,
-        // while with x86 CPUs after AVX, shuffles within 256 bits shouldn't be an issue either.
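        // A worked example for a single byte: take input 'a' == 0x61. Its low nibble is 0x1 and its high
        // nibble is 0x6, so `bitmask == 1 << 0x1 == 0x02` and the candidate bitsets are `filter->_u8s[12]`
        // (even) and `filter->_u8s[13]` (odd). Since the low nibble is below 8, the even one is tested,
        // and bit 1 of byte 12 is exactly the bit that `sz_charset_add` sets for character code 97.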
- sz_size_t load_length = sz_min_of_two(length, 64); - __mmask64 load_mask = _sz_u64_mask_until(load_length); - text_vec.zmm = _mm512_maskz_loadu_epi8(load_mask, text); - lower_nibbles_vec.zmm = _mm512_and_si512(text_vec.zmm, _mm512_set1_epi8(0x0f)); - bitmask_vec.zmm = _mm512_shuffle_epi8(bitmask_lookup_vec.zmm, lower_nibbles_vec.zmm); - // - // At this point we can validate the `bitmask_vec` contents like this: - // - // for (sz_size_t i = 0; i != load_length; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t lo_nibble = input & 0x0f; - // sz_u8_t bitmask = (1 << (lo_nibble & 0x7)); - // sz_assert(bitmask_vec.u8s[i] == bitmask); - // } - // - // Shift right every byte by 4 bits. - // There is no `_mm512_srli_epi8` intrinsic, so we have to use `_mm512_srli_epi16` - // and combine it with a mask to clear the higher bits. - higher_nibbles_vec.zmm = _mm512_and_si512(_mm512_srli_epi16(text_vec.zmm, 4), _mm512_set1_epi8(0x0f)); - bitset_even_vec.zmm = _mm512_shuffle_epi8(filter_even_vec.zmm, higher_nibbles_vec.zmm); - bitset_odd_vec.zmm = _mm512_shuffle_epi8(filter_odd_vec.zmm, higher_nibbles_vec.zmm); - // - // At this point we can validate the `bitset_even_vec` and `bitset_odd_vec` contents like this: - // - // for (sz_size_t i = 0; i != load_length; ++i) { - // sz_u8_t input = *(sz_u8_t const *)(text + i); - // sz_u8_t const *bitset_ptr = &filter->_u8s[0]; - // sz_u8_t hi_nibble = input >> 4; - // sz_u8_t bitset_even = bitset_ptr[hi_nibble * 2]; - // sz_u8_t bitset_odd = bitset_ptr[hi_nibble * 2 + 1]; - // sz_assert(bitset_even_vec.u8s[i] == bitset_even); - // sz_assert(bitset_odd_vec.u8s[i] == bitset_odd); - // } - // - // TODO: Is this a good place for ternary logic? - __mmask64 take_first = _mm512_cmplt_epi8_mask(lower_nibbles_vec.zmm, _mm512_set1_epi8(8)); - bitset_even_vec.zmm = _mm512_mask_blend_epi8(take_first, bitset_odd_vec.zmm, bitset_even_vec.zmm); - __mmask64 matches_mask = _mm512_mask_test_epi8_mask(load_mask, bitset_even_vec.zmm, bitmask_vec.zmm); - if (matches_mask) { - int offset = sz_u64_ctz(matches_mask); - return text + offset; - } - else { text += load_length, length -= load_length; } - } - - return SZ_NULL_CHAR; -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_ice(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); -} - -SZ_PUBLIC sz_cptr_t sz_find_many_avx512( // - sz_cptr_t haystack, sz_size_t haystack_length, // - sz_cptr_t const *needles, sz_size_t const *needles_lengths, // - sz_size_t *needle_offset) { - - // When dealing with huge needles vocabularies, like in tokenization workloads, we need to construct an automaton. - // But in many cases, the vocabulary is small enough to use a simpler DFA-less approach, combining the ideas from - // the `sz_find_skylake` and `sz_find_charset_ice` functions. - // - // Pick the offsets within needles where there is the least variance in the characters. - // Like for "the", "then", "there", "these", "those", "their", "they", "them", "that", "this", "thus", "than": - // - // 0: 't' - // 1: 'h' - // 2: 'e', 'a', 'i', 'o', 'u' - // 3: 'n', 'r', 's', 'i', 'y', 'm', 't' - // - // So depending on our "register budget", we can use a different number of pivot points: offset 0, 1, 2 make - // the most sense if we can only use 3 ZMM registers. - sz_unused(haystack && haystack_length && needles && needles_lengths && needle_offset); - return 0; -} - -/** - * Computes the Needleman Wunsch alignment score between two strings. 
- * The method uses 32-bit integers to accumulate the running score for every cell in the matrix.
- * Assuming the costs of substitutions can be arbitrary signed 8-bit integers, the method is expected to be used
- * on strings not exceeding 2^24 (~16.7 million) characters in length.
- *
- * Unlike the `_sz_edit_distance_skewed_diagonals_upto65k_avx512` method, this one uses signed integers to store
- * the accumulated score. Moreover, its primary bottleneck is the latency of gathering the substitution costs
- * from the substitution matrix. If we use the diagonal order, we will be comparing a slice of the first string with
- * a slice of the second. If we stick to the conventional horizontal order, we will be comparing one character against
- * a slice, which is much easier to optimize. In that case we are sampling costs not from arbitrary parts of
- * a 256 x 256 matrix, but from a single row!
- */
-SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_avx512( //
-    sz_cptr_t shorter, sz_size_t shorter_length,                        //
-    sz_cptr_t longer, sz_size_t longer_length,                          //
-    sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) {
-
-    // If one of the strings is empty - the score is the gap penalty times the length of the other one
-    if (longer_length == 0) return (sz_ssize_t)shorter_length * gap;
-    if (shorter_length == 0) return (sz_ssize_t)longer_length * gap;
-
-    // Let's make sure that we use an amount of memory proportional to the
-    // number of elements in the shorter string, not the longer.
-    if (shorter_length > longer_length) {
-        sz_pointer_swap((void **)&longer_length, (void **)&shorter_length);
-        sz_pointer_swap((void **)&longer, (void **)&shorter);
-    }
-
-    // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
-    sz_memory_allocator_t global_alloc;
-    if (!alloc) {
-        sz_memory_allocator_init_default(&global_alloc);
-        alloc = &global_alloc;
-    }
-
-    sz_size_t const max_length = 256ull * 256ull * 256ull;
-    sz_size_t const n = longer_length + 1;
-    sz_assert(n < max_length && "The length must fit into 24-bit integer. Otherwise use serial variant.");
-    sz_unused(longer_length && max_length);
-
-    sz_size_t buffer_length = sizeof(sz_i32_t) * n * 2;
-    sz_i32_t *distances = (sz_i32_t *)alloc->allocate(buffer_length, alloc->handle);
-    sz_i32_t *previous_distances = distances;
-    sz_i32_t *current_distances = previous_distances + n;
-
-    // Initialize the first row of the Levenshtein matrix with `iota`.
-    for (sz_size_t idx_longer = 0; idx_longer != n; ++idx_longer)
-        previous_distances[idx_longer] = (sz_i32_t)idx_longer * gap;
-
-    /// Contains up to 64 consecutive characters from the longer string.
-    sz_u512_vec_t longer_vec;
-    sz_u512_vec_t cost_deletion_vec, cost_substitution_vec, lookup_substitution_vec, current_vec;
-    sz_u512_vec_t row_first_subs_vec, row_second_subs_vec, row_third_subs_vec, row_fourth_subs_vec;
-    sz_u512_vec_t shuffled_first_subs_vec, shuffled_second_subs_vec, shuffled_third_subs_vec, shuffled_fourth_subs_vec;
-
-    // Prepare constants and masks.
-    sz_u512_vec_t is_third_or_fourth_vec, is_second_or_fourth_vec, gap_vec;
-    {
-        char is_third_or_fourth_check, is_second_or_fourth_check;
-        *(sz_u8_t *)&is_third_or_fourth_check = 0x80, *(sz_u8_t *)&is_second_or_fourth_check = 0x40;
-        is_third_or_fourth_vec.zmm = _mm512_set1_epi8(is_third_or_fourth_check);
-        is_second_or_fourth_vec.zmm = _mm512_set1_epi8(is_second_or_fourth_check);
-        gap_vec.zmm = _mm512_set1_epi32(gap);
-    }
-
-    sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter;
-    for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) {
-        sz_i32_t last_in_row = current_distances[0] = (sz_i32_t)(idx_shorter + 1) * gap;
-
-        // Load one row of the substitution matrix into four ZMM registers.
-        sz_error_cost_t const *row_subs = subs + shorter_unsigned[idx_shorter] * 256u;
-        row_first_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 0);
-        row_second_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 1);
-        row_third_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 2);
-        row_fourth_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 3);
-
-        // In the serial version we have one forward pass that computes the deletion,
-        // insertion, and substitution costs at once.
-        //     for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) {
-        //         sz_ssize_t cost_deletion = previous_distances[idx_longer + 1] + gap;
-        //         sz_ssize_t cost_insertion = current_distances[idx_longer] + gap;
-        //         sz_ssize_t cost_substitution = previous_distances[idx_longer] + row_subs[longer_unsigned[idx_longer]];
-        //         current_distances[idx_longer + 1] = sz_min_of_three(cost_deletion, cost_insertion, cost_substitution);
-        //     }
-        //
-        // Given the complexity of handling the data-dependency between consecutive insertion cost computations
-        // within a Levenshtein matrix, the simplest design would be to vectorize every kind of cost computation
-        // separately.
-        //     1. Compute substitution costs for up to 64 characters at once, upcasting from 8-bit integers to 32.
-        //     2. Compute the pairwise minimum with deletion costs.
-        //     3. Inclusive prefix minimum computation to combine with insertion costs.
-        // Proceeding with substitutions:
-        for (sz_size_t idx_longer = 0; idx_longer < longer_length; idx_longer += 64) {
-            sz_size_t register_length = sz_min_of_two(longer_length - idx_longer, 64);
-            __mmask64 mask = _sz_u64_mask_until(register_length);
-            longer_vec.zmm = _mm512_maskz_loadu_epi8(mask, longer + idx_longer);
-
-            // Blend the `row_(first|second|third|fourth)_subs_vec` into `current_vec`, picking the right source
-            // for every character in `longer_vec`. Before that, we need to permute the substitution vectors.
-            // Only the bottom 6 bits of a byte are used in VPERMB, so we don't even need to mask.
-            shuffled_first_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_first_subs_vec.zmm);
-            shuffled_second_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_second_subs_vec.zmm);
-            shuffled_third_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_third_subs_vec.zmm);
-            shuffled_fourth_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_fourth_subs_vec.zmm);
-
-            // To blend we can invoke three `_mm512_cmplt_epu8_mask`, but we can also achieve the same using
-            // the AND logical operation, checking the top two bits of every byte.
-            // Continuing this thought, we can use the VPTESTMB instruction to output the mask after the AND.
- __mmask64 is_third_or_fourth = _mm512_mask_test_epi8_mask(mask, longer_vec.zmm, is_third_or_fourth_vec.zmm); - __mmask64 is_second_or_fourth = - _mm512_mask_test_epi8_mask(mask, longer_vec.zmm, is_second_or_fourth_vec.zmm); - lookup_substitution_vec.zmm = _mm512_mask_blend_epi8( - is_third_or_fourth, - // Choose between the first and the second. - _mm512_mask_blend_epi8(is_second_or_fourth, shuffled_first_subs_vec.zmm, shuffled_second_subs_vec.zmm), - // Choose between the third and the fourth. - _mm512_mask_blend_epi8(is_second_or_fourth, shuffled_third_subs_vec.zmm, shuffled_fourth_subs_vec.zmm)); - - // First, sign-extend lower and upper 16 bytes to 16-bit integers. - __m512i current_0_31_vec = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(lookup_substitution_vec.zmm, 0)); - __m512i current_32_63_vec = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(lookup_substitution_vec.zmm, 1)); - - // Now extend those 16-bit integers to 32-bit. - // This isn't free, same as the subsequent store, so we only want to do that for the populated lanes. - // To minimize the number of loads and stores, we can combine our substitution costs with the previous - // distances, containing the deletion costs. - { - cost_substitution_vec.zmm = _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer); - cost_substitution_vec.zmm = _mm512_add_epi32( - cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_0_31_vec, 0))); - cost_deletion_vec.zmm = _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer); - cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm); - current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm); - - // Inclusive prefix minimum computation to combine with insertion costs. - // Simply disabling this operation results in 5x performance improvement, meaning - // that this operation is responsible for 80% of the total runtime. - // for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) { - // current_distances[idx_longer + 1] = - // sz_max_of_two(current_distances[idx_longer] + gap, current_distances[idx_longer + 1]); - // } - // - // To perform the same operation in vectorized form, we need to perform a tree-like reduction, - // that will involve multiple steps. It's quite expensive and should be first tested in the - // "experimental" section. - // - // Another approach might be loop unrolling: - // current_vec.i32s[0] = last_in_row = sz_i32_max_of_two(current_vec.i32s[0], last_in_row + gap); - // current_vec.i32s[1] = last_in_row = sz_i32_max_of_two(current_vec.i32s[1], last_in_row + gap); - // current_vec.i32s[2] = last_in_row = sz_i32_max_of_two(current_vec.i32s[2], last_in_row + gap); - // ... yet this approach is also quite expensive. - for (int i = 0; i != 16; ++i) - current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap); - _mm512_mask_storeu_epi32(current_distances + idx_longer + 1, (__mmask16)mask, current_vec.zmm); - } - - // Export the values from 16 to 31. 
-            if (register_length > 16) {
-                mask = _kshiftri_mask64(mask, 16);
-                cost_substitution_vec.zmm =
-                    _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 16);
-                cost_substitution_vec.zmm = _mm512_add_epi32(
-                    cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_0_31_vec, 1)));
-                cost_deletion_vec.zmm =
-                    _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 16);
-                cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm);
-                current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm);
-
-                // Aggregate running insertion costs within the register.
-                for (int i = 0; i != 16; ++i)
-                    current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
-                _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 16, (__mmask16)mask, current_vec.zmm);
-            }
-
-            // Export the values from 32 to 47.
-            if (register_length > 32) {
-                mask = _kshiftri_mask64(mask, 16);
-                cost_substitution_vec.zmm =
-                    _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 32);
-                cost_substitution_vec.zmm = _mm512_add_epi32(
-                    cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_32_63_vec, 0)));
-                cost_deletion_vec.zmm =
-                    _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 32);
-                cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm);
-                current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm);
-
-                // Aggregate running insertion costs within the register.
-                for (int i = 0; i != 16; ++i)
-                    current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
-                _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 32, (__mmask16)mask, current_vec.zmm);
-            }
-
-            // Export the values from 48 to 63.
-            if (register_length > 48) {
-                mask = _kshiftri_mask64(mask, 16);
-                cost_substitution_vec.zmm =
-                    _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + idx_longer + 48);
-                cost_substitution_vec.zmm = _mm512_add_epi32(
-                    cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_32_63_vec, 1)));
-                cost_deletion_vec.zmm =
-                    _mm512_maskz_loadu_epi32((__mmask16)mask, previous_distances + 1 + idx_longer + 48);
-                cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm);
-                current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm);
-
-                // Aggregate running insertion costs within the register.
-                for (int i = 0; i != 16; ++i)
-                    current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
-                _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 48, (__mmask16)mask, current_vec.zmm);
-            }
-        }
-
-        // Swap previous_distances and current_distances pointers
-        sz_pointer_swap((void **)&previous_distances, (void **)&current_distances);
-    }
-
-    // Cache scalar before `free` call.
-    sz_ssize_t result = previous_distances[longer_length];
-    alloc->free(distances, buffer_length, alloc->handle);
-    return result;
-}
-
-SZ_INTERNAL sz_ssize_t sz_alignment_score_avx512( //
-    sz_cptr_t shorter, sz_size_t shorter_length,  //
-    sz_cptr_t longer, sz_size_t longer_length,    //
-    sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) {
-
-    if (sz_max_of_two(shorter_length, longer_length) < (256ull * 256ull * 256ull))
-        return _sz_alignment_score_wagner_fisher_upto17m_avx512(shorter, shorter_length, longer, longer_length, subs,
-                                                                gap, alloc);
-    else
-        return sz_alignment_score_serial(shorter, shorter_length, longer, longer_length, subs, gap, alloc);
-}
-
-enum sz_encoding_t {
-    sz_encoding_unknown_k = 0,
-    sz_encoding_ascii_k = 1,
-    sz_encoding_utf8_k = 2,
-    sz_encoding_utf16_k = 3,
-    sz_encoding_utf32_k = 4,
-    sz_jwt_k,
-    sz_base64_k,
-    // Low priority encodings:
-    sz_encoding_utf8bom_k = 5,
-    sz_encoding_utf16le_k = 6,
-    sz_encoding_utf16be_k = 7,
-    sz_encoding_utf32le_k = 8,
-    sz_encoding_utf32be_k = 9,
-};
-
-// Character Set Detection is one of the most commonly performed operations in data processing, with
-// [Chardet](https://github.com/chardet/chardet), [Charset Normalizer](https://github.com/jawah/charset_normalizer),
-// and [cChardet](https://github.com/PyYoshi/cChardet) being the most commonly used options in the Python ecosystem.
-// All of them are notoriously slow.
-//
-// Moreover, as of October 2024, UTF-8 is the dominant character encoding on the web, used by 98.4% of websites.
-// Others have minimal usage, according to [W3Techs](https://w3techs.com/technologies/overview/character_encoding):
-// - ISO-8859-1: 1.2%
-// - Windows-1252: 0.3%
-// - Windows-1251: 0.2%
-// - EUC-JP: 0.1%
-// - Shift JIS: 0.1%
-// - EUC-KR: 0.1%
-// - GB2312: 0.1%
-// - Windows-1250: 0.1%
-// Within programming language implementations and database management systems, 16-bit and 32-bit fixed-width encodings
-// are also very popular and we need a way to efficiently differentiate between the most common UTF flavors, ASCII, and
-// the rest.
-//
-// One good solution is the [simdutf](https://github.com/simdutf/simdutf) library, but it depends on the C++ runtime
-// and focuses more on incremental validation & transcoding, rather than detection.
-//
-// So we need a very fast and efficient way of determining the most likely encoding of a given buffer.
-SZ_PUBLIC sz_bool_t sz_detect_encoding(sz_cptr_t text, sz_size_t length) {
-    // https://github.com/simdutf/simdutf/blob/master/src/icelake/icelake_utf8_validation.inl.cpp
-    // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_from_utf8.inl.cpp#L81
-    // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L661
-    // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L788
-
-    // We can implement this operation in a simpler & different way, assuming most of the time continuous chunks of memory
-    // have identical encoding. With Russian and many European languages, we generally deal with 2-byte codepoints
-    // with occasional 1-byte punctuation marks. In the case of Chinese, Japanese, and Korean, we deal with 3-byte
-    // codepoints. In the case of emojis, we deal with 4-byte codepoints.
-    // We can also use the idea that misaligned reads are quite cheap on modern CPUs.
- int can_be_ascii = 1, can_be_utf8 = 1, can_be_utf16 = 1, can_be_utf32 = 1; - sz_unused(can_be_ascii + can_be_utf8 + can_be_utf16 + can_be_utf32); - sz_unused(text && length); - return sz_false_k; -} - -#pragma clang attribute pop -#pragma GCC pop_options -#endif - -#pragma endregion - -/* @brief Implementation of the string search algorithms using the Arm NEON instruction set, available on 64-bit - * Arm processors. Implements: {substring search, character search, character set search} x {forward, reverse}. - */ -#pragma region ARM NEON - -#if SZ_USE_NEON -#pragma GCC push_options -#pragma GCC target("arch=armv8.2-a+simd") -#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) - -/** - * @brief Helper structure to simplify work with 64-bit words. - */ -typedef union sz_u128_vec_t { - uint8x16_t u8x16; - uint16x8_t u16x8; - uint32x4_t u32x4; - uint64x2_t u64x2; - sz_u64_t u64s[2]; - sz_u32_t u32s[4]; - sz_u16_t u16s[8]; - sz_u8_t u8s[16]; -} sz_u128_vec_t; - -SZ_INTERNAL sz_u64_t _sz_vreinterpretq_u8_u4(uint8x16_t vec) { - // Use `vshrn` to produce a bitmask, similar to `movemask` in SSE. - // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon - return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(vec), 4)), 0) & 0x8888888888888888ull; -} - -SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { - //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: - //! https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations - return sz_order_serial(a, a_length, b, b_length); -} - -SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_u128_vec_t a_vec, b_vec; - for (; length >= 16; a += 16, b += 16, length -= 16) { - a_vec.u8x16 = vld1q_u8((sz_u8_t const *)a); - b_vec.u8x16 = vld1q_u8((sz_u8_t const *)b); - uint8x16_t cmp = vceqq_u8(a_vec.u8x16, b_vec.u8x16); - if (vminvq_u8(cmp) != 255) { return sz_false_k; } // Check if all bytes match - } - - // Handle remaining bytes - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; -} - -SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) { - uint64x2_t sum_vec = vdupq_n_u64(0); - - // Process 16 bytes (128 bits) at a time - for (; length >= 16; text += 16, length -= 16) { - uint8x16_t vec = vld1q_u8((sz_u8_t const *)text); // Load 16 bytes - uint16x8_t pairwise_sum1 = vpaddlq_u8(vec); // Pairwise add lower and upper 8 bits - uint32x4_t pairwise_sum2 = vpaddlq_u16(pairwise_sum1); // Pairwise add 16-bit results - uint64x2_t pairwise_sum3 = vpaddlq_u32(pairwise_sum2); // Pairwise add 32-bit results - sum_vec = vaddq_u64(sum_vec, pairwise_sum3); // Accumulate the sum - } - - // Final reduction of `sum_vec` to a single scalar - sz_u64_t sum = vgetq_lane_u64(sum_vec, 0) + vgetq_lane_u64(sum_vec, 1); - if (length) sum += sz_checksum_serial(text, length); - return sum; -} - -SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // In most cases the `source` and the `target` are not aligned, but we should - // at least make sure that writes don't touch many cache lines. - // NEON has an instruction to load and write 64 bytes at once. - // - // sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less. 
- // sz_size_t tail_length = (sz_size_t)(target + length) % 64; // 63 or less. - // for (; head_length; target += 1, source += 1, head_length -= 1) *target = *source; - // length -= head_length; - // for (; length >= 64; target += 64, source += 64, length -= 64) - // vst4q_u8((sz_u8_t *)target, vld1q_u8_x4((sz_u8_t const *)source)); - // for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = *source; - // - // Sadly, those instructions end up being 20% slower than the code processing 16 bytes at a time: - for (; length >= 16; target += 16, source += 16, length -= 16) - vst1q_u8((sz_u8_t *)target, vld1q_u8((sz_u8_t const *)source)); - if (length) sz_copy_serial(target, source, length); -} - -SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - // When moving small buffers, using a small buffer on stack as a temporary storage is faster. - - if (target < source || target >= source + length) { - // Non-overlapping, proceed forward - sz_copy_neon(target, source, length); - } - else { - // Overlapping, proceed backward - target += length; - source += length; - - sz_u128_vec_t src_vec; - while (length >= 16) { - target -= 16, source -= 16, length -= 16; - src_vec.u8x16 = vld1q_u8((sz_u8_t const *)source); - vst1q_u8((sz_u8_t *)target, src_vec.u8x16); - } - while (length) { - target -= 1, source -= 1, length -= 1; - *target = *source; - } - } -} - -SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - uint8x16_t fill_vec = vdupq_n_u8(value); // Broadcast the value across the register - - while (length >= 16) { - vst1q_u8((sz_u8_t *)target, fill_vec); - target += 16; - length -= 16; - } - - // Handle remaining bytes - if (length) sz_fill_serial(target, length, value); -} - -SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - - // If the input is tiny (especially smaller than the look-up table itself), we may end up paying - // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. - if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); - return; - } - - sz_size_t head_length = (16 - ((sz_size_t)target % 16)) % 16; // 15 or less. - sz_size_t tail_length = (sz_size_t)(target + length) % 16; // 15 or less. - - // We need to pull the lookup table into 16x NEON registers. We have a total of 32 such registers. - // According to the Neoverse V2 manual, the 4-table lookup has a latency of 6 cycles, and 4x throughput. - uint8x16x4_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec; - lut_0_to_63_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 0)); - lut_64_to_127_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 64)); - lut_128_to_191_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 128)); - lut_192_to_255_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 192)); - - sz_u128_vec_t source_vec; - // If the top bit is set in each word of `source_vec`, than we use `lookup_128_to_191_vec` or - // `lookup_192_to_255_vec`. If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`. 
- sz_u128_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec; - sz_u128_vec_t blended_0_to_255_vec; - - // Process the head with serial code - for (; head_length; target += 1, source += 1, head_length -= 1) *target = lut[*(sz_u8_t const *)source]; - - // Table lookups on Arm are much simpler to use than on x86, as we can use the `vqtbl4q_u8` instruction - // to perform a 4-table lookup in a single instruction. The XORs are used to adjust the lookup position - // within each 64-byte range of the table. - // Details on the 4-table lookup: https://lemire.me/blog/2019/07/23/arbitrary-byte-to-byte-maps-using-arm-neon/ - length -= head_length; - length -= tail_length; - for (; length >= 16; source += 16, target += 16, length -= 16) { - source_vec.u8x16 = vld1q_u8((sz_u8_t const *)source); - lookup_0_to_63_vec.u8x16 = vqtbl4q_u8(lut_0_to_63_vec, source_vec.u8x16); - lookup_64_to_127_vec.u8x16 = vqtbl4q_u8(lut_64_to_127_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x40))); - lookup_128_to_191_vec.u8x16 = vqtbl4q_u8(lut_128_to_191_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x80))); - lookup_192_to_255_vec.u8x16 = vqtbl4q_u8(lut_192_to_255_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0xc0))); - blended_0_to_255_vec.u8x16 = vorrq_u8(vorrq_u8(lookup_0_to_63_vec.u8x16, lookup_64_to_127_vec.u8x16), - vorrq_u8(lookup_128_to_191_vec.u8x16, lookup_192_to_255_vec.u8x16)); - vst1q_u8((sz_u8_t *)target, blended_0_to_255_vec.u8x16); - } - - // Process the tail with serial code - for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = lut[*(sz_u8_t const *)source]; -} - -SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - sz_u64_t matches; - sz_u128_vec_t h_vec, n_vec, matches_vec; - n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n); - - while (h_length >= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h); - matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16); - // In Arm NEON we don't have a `movemask` to combine it with `ctz` and get the offset of the match. - // But assuming the `vmaxvq` is cheap, we can use it to find the first match, by blending (bitwise selecting) - // the vector with a relative offsets array. - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - - h += 16, h_length -= 16; - } - - return sz_find_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { - sz_u64_t matches; - sz_u128_vec_t h_vec, n_vec, matches_vec; - n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n); - - while (h_length >= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h + h_length - 16); - matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; - h_length -= 16; - } - - return sz_rfind_byte_serial(h, h_length, n); -} - -SZ_PUBLIC sz_u64_t _sz_find_charset_neon_register(sz_u128_vec_t h_vec, uint8x16_t set_top_vec_u8x16, - uint8x16_t set_bottom_vec_u8x16) { - - // Once we've read the characters in the haystack, we want to - // compare them against our bitset. The serial version of that code - // would look like: `(set_->_u8s[c >> 3] & (1u << (c & 7u))) != 0`. 
-    uint8x16_t byte_index_vec = vshrq_n_u8(h_vec.u8x16, 3);
-    uint8x16_t byte_mask_vec = vshlq_u8(vdupq_n_u8(1), vreinterpretq_s8_u8(vandq_u8(h_vec.u8x16, vdupq_n_u8(7))));
-    uint8x16_t matches_top_vec = vqtbl1q_u8(set_top_vec_u8x16, byte_index_vec);
-    // The table lookup instruction in NEON replies to out-of-bound requests with zeros.
-    // The values in `byte_index_vec` all fall in [0; 32). So for values under 16, subtracting 16 will underflow
-    // and map into interval [240, 256). Meaning that those will be populated with zeros and we can safely
-    // merge `matches_top_vec` and `matches_bottom_vec` with a bitwise OR.
-    uint8x16_t matches_bottom_vec = vqtbl1q_u8(set_bottom_vec_u8x16, vsubq_u8(byte_index_vec, vdupq_n_u8(16)));
-    uint8x16_t matches_vec = vorrq_u8(matches_top_vec, matches_bottom_vec);
-    // Instead of a pure `vandq_u8`, we can immediately broadcast a match presence across each 8-bit word.
-    matches_vec = vtstq_u8(matches_vec, byte_mask_vec);
-    return _sz_vreinterpretq_u8_u4(matches_vec);
-}
-
-SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
-
-    // This almost never fires, but it's better to be safe than sorry.
-    if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
-    if (n_length == 1) return sz_find_byte_neon(h, h_length, n);
-
-    // Scan through the string.
-    // Assuming how tiny the Arm NEON registers are, we should avoid internal branches at all costs.
-    // That's why, for smaller needles, we use different loops.
-    if (n_length == 2) {
-        // Broadcast needle characters into SIMD registers.
-        sz_u64_t matches;
-        sz_u128_vec_t h_first_vec, h_last_vec, n_first_vec, n_last_vec, matches_vec;
-        // Dealing with 16-bit values, we can load 2 registers at a time and compare 31 possible offsets
-        // in a single loop iteration.
-        n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]);
-        n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]);
-        for (; h_length >= 17; h += 16, h_length -= 16) {
-            h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0));
-            h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1));
-            matches_vec.u8x16 =
-                vandq_u8(vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
-            matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16);
-            if (matches) return h + sz_u64_ctz(matches) / 4;
-        }
-    }
-    else if (n_length == 3) {
-        // Broadcast needle characters into SIMD registers.
-        sz_u64_t matches;
-        sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec;
-        // Comparing 24-bit values is a bummer. Being lazy, I went with the same approach
-        // as when searching for strings over 4 characters long. I only avoid the last comparison.
-        n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]);
-        n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]);
-        n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[2]);
-        for (; h_length >= 18; h += 16, h_length -= 16) {
-            h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0));
-            h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1));
-            h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 2));
-            matches_vec.u8x16 = vandq_u8(                           //
-                vandq_u8(                                           //
-                    vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), //
-                    vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)),
-                vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
-            matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16);
-            if (matches) return h + sz_u64_ctz(matches) / 4;
-        }
-    }
-    else {
-        // Pick the parts of the needle that are worth comparing.
- sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - // Broadcast those characters into SIMD registers. - sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]); - // Walk through the string. - for (; h_length >= n_length + 16; h += 16, h_length -= 16) { - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_first)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_mid)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_last)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - while (matches) { - int potential_offset = sz_u64_ctz(matches) / 4; - if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset; - matches &= matches - 1; - } - } - } - - return sz_find_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - - // This almost never fires, but it's better to be safe than sorry. - if (h_length < n_length || !n_length) return SZ_NULL_CHAR; - if (n_length == 1) return sz_rfind_byte_neon(h, h_length, n); - - // Pick the parts of the needle that are worth comparing. - sz_size_t offset_first, offset_mid, offset_last; - _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last); - - // Will contain 4 bits per character. 
- sz_u64_t matches; - sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec; - n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]); - n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]); - n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]); - - sz_cptr_t h_reversed; - for (; h_length >= n_length + 16; h_length -= 16) { - h_reversed = h + h_length - n_length - 16 + 1; - h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_first)); - h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_mid)); - h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_last)); - matches_vec.u8x16 = vandq_u8( // - vandq_u8( // - vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), // - vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)), - vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16)); - matches = _sz_vreinterpretq_u8_u4(matches_vec.u8x16); - while (matches) { - int potential_offset = sz_u64_clz(matches) / 4; - if (sz_equal(h + h_length - n_length - potential_offset, n, n_length)) - return h + h_length - n_length - potential_offset; - sz_assert((matches & (1ull << (63 - potential_offset * 4))) != 0 && - "The bit must be set before we squash it"); - matches &= ~(1ull << (63 - potential_offset * 4)); - } - } - - return sz_rfind_serial(h, h_length, n, n_length); -} - -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { - sz_u64_t matches; - sz_u128_vec_t h_vec; - uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); - uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); - - for (; h_length >= 16; h += 16, h_length -= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h)); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); - if (matches) return h + sz_u64_ctz(matches) / 4; - } - - return sz_find_charset_serial(h, h_length, set); -} - -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { - sz_u64_t matches; - sz_u128_vec_t h_vec; - uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); - uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); - - // Check `sz_find_charset_neon` for explanations. - for (; h_length >= 16; h_length -= 16) { - h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h) + h_length - 16); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); - if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; - } - - return sz_rfind_charset_serial(h, h_length, set); -} - -#pragma clang attribute pop -#pragma GCC pop_options -#endif // Arm Neon - -#pragma endregion - -/* @brief Implementation of the string search algorithms using the Arm SVE variable-length registers, available - * in Arm v9 processors. - * - * Implements: - * - memory: {copy, move, fill} - * - comparisons: {equal, order} - * - search: {substring, character, character set} x {forward, reverse}. 
- */
-#pragma region ARM SVE
-
-#if SZ_USE_SVE
-#pragma GCC push_options
-#pragma GCC target("arch=armv8.2-a+sve")
-#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function)
-
-SZ_PUBLIC void sz_fill_sve(sz_ptr_t target, sz_size_t length, sz_u8_t value) {
-    svuint8_t value_vec = svdup_u8(value);
-    sz_size_t vec_len = svcntb(); // Vector length in bytes (scalable)
-
-    if (length <= vec_len) {
-        // Small buffer case: use mask to handle small writes
-        svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)length);
-        svst1_u8(mask, (unsigned char *)target, value_vec);
-    }
-    else {
-        // Calculate head, body, and tail sizes
-        sz_size_t head_length = vec_len - ((sz_size_t)target % vec_len);
-        sz_size_t tail_length = (sz_size_t)(target + length) % vec_len;
-        sz_size_t body_length = length - head_length - tail_length;
-
-        // Handle unaligned head
-        svbool_t head_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)head_length);
-        svst1_u8(head_mask, (unsigned char *)target, value_vec);
-        target += head_length;
-
-        // Aligned body loop
-        for (; body_length >= vec_len; target += vec_len, body_length -= vec_len) {
-            svst1_u8(svptrue_b8(), (unsigned char *)target, value_vec);
-        }
-
-        // Handle unaligned tail
-        svbool_t tail_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)tail_length);
-        svst1_u8(tail_mask, (unsigned char *)target, value_vec);
-    }
-}
-
-SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
-    sz_size_t vec_len = svcntb(); // Vector length in bytes
-
-    // Arm Neoverse V2 cores in Graviton 4, for example, come with 256 KB of L1 data cache per core,
-    // and 8 MB of L2 cache per core. Moreover, the L1 cache is fully associative.
-    // With two strings, we may consider the overall workload huge, if each exceeds 1 MB in length.
-    //
-    //      int is_huge = length >= 4ull * 1024ull * 1024ull;
-    //
-    // When the buffer is small, there isn't much to innovate.
-    if (length <= vec_len) {
-        // Small buffer case: use mask to handle small writes
-        svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)length);
-        svuint8_t data = svld1_u8(mask, (unsigned char *)source);
-        svst1_u8(mask, (unsigned char *)target, data);
-    }
-    // When dealing with larger buffers, similar to AVX-512, we want to minimize unaligned operations
-    // and handle the head, body, and tail separately. We can also traverse the buffer in both directions,
-    // as Arm generally supports more simultaneous stores than x86 CPUs.
-    //
-    // For gigantic datasets, similar to AVX-512, non-temporal "loads" and "stores" can be used.
-    // Sadly, if the register size (16 bytes or larger) is smaller than a cache line (64 bytes),
-    // we will pay a huge penalty on loads, fetching the same content many times.
-    // It may be better to allow caching (and subsequent eviction), in favor of using four-element
-    // tuples, which will be guaranteed to be a multiple of a cache line.
-    //
-    // Another approach is to use the `LD4B` instructions, which will populate four registers at once.
-    // This, however, further decreases the performance from LibC-like 29 GB/s to 20 GB/s.
-    else {
-        // Calculating head, body, and tail sizes depends on the `vec_len`,
-        // but it's a runtime constant, and the modulo operation is expensive!
-        // Instead we use the fact that it's always a multiple of 128 bits or 16 bytes.
- sz_size_t head_length = 16 - ((sz_size_t)target % 16); - sz_size_t tail_length = (sz_size_t)(target + length) % 16; - sz_size_t body_length = length - head_length - tail_length; - - // Handle unaligned parts - svbool_t head_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)head_length); - svuint8_t head_data = svld1_u8(head_mask, (unsigned char *)source); - svst1_u8(head_mask, (unsigned char *)target, head_data); - svbool_t tail_mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)tail_length); - svuint8_t tail_data = svld1_u8(tail_mask, (unsigned char *)source + head_length + body_length); - svst1_u8(tail_mask, (unsigned char *)target + head_length + body_length, tail_data); - target += head_length; - source += head_length; - - // Aligned body loop, walking in two directions - for (; body_length >= vec_len * 2; target += vec_len, source += vec_len, body_length -= vec_len * 2) { - svuint8_t forward_data = svld1_u8(svptrue_b8(), (unsigned char *)source); - svuint8_t backward_data = svld1_u8(svptrue_b8(), (unsigned char *)source + body_length - vec_len); - svst1_u8(svptrue_b8(), (unsigned char *)target, forward_data); - svst1_u8(svptrue_b8(), (unsigned char *)target + body_length - vec_len, backward_data); - } - // Up to (vec_len * 2 - 1) bytes of data may be left in the body, - // so we can unroll the last two optional loop iterations. - if (body_length > vec_len) { - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)body_length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - body_length -= vec_len; - source += body_length; - target += body_length; - } - if (body_length) { - svbool_t mask = svwhilelt_b8((sz_u32_t)0ull, (sz_u32_t)body_length); - svuint8_t data = svld1_u8(mask, (unsigned char *)source); - svst1_u8(mask, (unsigned char *)target, data); - } - } -} - -#pragma clang attribute pop -#pragma GCC pop_options -#endif // Arm SVE - -#pragma endregion - -/* - * @brief Pick the right implementation for the string search algorithms. - */ -#pragma region Compile Time Dispatching - -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t ins, sz_size_t length) { return sz_hash_serial(ins, length); } -SZ_PUBLIC void sz_tolower(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_tolower_serial(ins, length, outs); } -SZ_PUBLIC void sz_toupper(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toupper_serial(ins, length, outs); } -SZ_PUBLIC void sz_toascii(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toascii_serial(ins, length, outs); } -SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t ins, sz_size_t length) { return sz_isascii_serial(ins, length); } - -SZ_PUBLIC void sz_hashes_fingerprint(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_ptr_t fingerprint, - sz_size_t fingerprint_bytes) { - - sz_bool_t fingerprint_length_is_power_of_two = (sz_bool_t)((fingerprint_bytes & (fingerprint_bytes - 1)) == 0); - sz_string_view_t fingerprint_buffer = {fingerprint, fingerprint_bytes}; - - // There are several issues related to the fingerprinting algorithm. - // First, the memory traversal order is important. - // https://blog.stuffedcow.net/2015/08/pagewalk-coherence/ - - // In most cases the fingerprint length will be a power of two. 
- if (fingerprint_length_is_power_of_two == sz_false_k) - sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_non_pow2_callback, &fingerprint_buffer); - else - sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_pow2_callback, &fingerprint_buffer); -} - -#if !SZ_DYNAMIC_DISPATCH - -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { -#if SZ_USE_ICE - return sz_checksum_avx512(text, length); -#elif SZ_USE_HASWELL - return sz_checksum_avx2(text, length); -#elif SZ_USE_NEON - return sz_checksum_neon(text, length); -#else - return sz_checksum_serial(text, length); -#endif -} - -SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { -#if SZ_USE_ICE - return sz_equal_skylake(a, b, length); -#elif SZ_USE_HASWELL - return sz_equal_avx2(a, b, length); -#elif SZ_USE_NEON - return sz_equal_neon(a, b, length); -#else - return sz_equal_serial(a, b, length); -#endif -} - -SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { -#if SZ_USE_ICE - return sz_order_avx512(a, a_length, b, b_length); -#elif SZ_USE_HASWELL - return sz_order_avx2(a, a_length, b, b_length); -#elif SZ_USE_NEON - return sz_order_neon(a, a_length, b, b_length); -#else - return sz_order_serial(a, a_length, b, b_length); -#endif -} - -SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_ICE - sz_copy_avx512(target, source, length); -#elif SZ_USE_HASWELL - sz_copy_avx2(target, source, length); -#elif SZ_USE_NEON - sz_copy_neon(target, source, length); -#else - sz_copy_serial(target, source, length); -#endif -} - -SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_ICE - sz_move_avx512(target, source, length); -#elif SZ_USE_HASWELL - sz_move_avx2(target, source, length); -#elif SZ_USE_NEON - sz_move_neon(target, source, length); -#else - sz_move_serial(target, source, length); -#endif -} - -SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) { -#if SZ_USE_ICE - sz_fill_avx512(target, length, value); -#elif SZ_USE_HASWELL - sz_fill_avx2(target, length, value); -#elif SZ_USE_NEON - sz_fill_neon(target, length, value); -#else - sz_fill_serial(target, length, value); -#endif -} - -SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { -#if SZ_USE_ICE - sz_look_up_transform_ice(source, length, lut, target); -#elif SZ_USE_HASWELL - sz_look_up_transform_avx2(source, length, lut, target); -#elif SZ_USE_NEON - sz_look_up_transform_neon(source, length, lut, target); -#else - sz_look_up_transform_serial(source, length, lut, target); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_ICE - return sz_find_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_HASWELL - return sz_find_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_NEON - return sz_find_byte_neon(haystack, h_length, needle); -#else - return sz_find_byte_serial(haystack, h_length, needle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { -#if SZ_USE_ICE - return sz_rfind_byte_avx512(haystack, h_length, needle); -#elif SZ_USE_HASWELL - return sz_rfind_byte_avx2(haystack, h_length, needle); -#elif SZ_USE_NEON - return sz_rfind_byte_neon(haystack, h_length, needle); -#else - return sz_rfind_byte_serial(haystack, h_length, needle); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, 
sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_ICE - return sz_find_skylake(haystack, h_length, needle, n_length); -#elif SZ_USE_HASWELL - return sz_find_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_NEON - return sz_find_neon(haystack, h_length, needle, n_length); -#else - return sz_find_serial(haystack, h_length, needle, n_length); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) { -#if SZ_USE_ICE - return sz_rfind_skylake(haystack, h_length, needle, n_length); -#elif SZ_USE_HASWELL - return sz_rfind_avx2(haystack, h_length, needle, n_length); -#elif SZ_USE_NEON - return sz_rfind_neon(haystack, h_length, needle, n_length); -#else - return sz_rfind_serial(haystack, h_length, needle, n_length); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_ICE - return sz_find_charset_ice(text, length, set); -#elif SZ_USE_HASWELL - return sz_find_charset_avx2(text, length, set); -#elif SZ_USE_NEON - return sz_find_charset_neon(text, length, set); -#else - return sz_find_charset_serial(text, length, set); -#endif -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { -#if SZ_USE_ICE - return sz_rfind_charset_ice(text, length, set); -#elif SZ_USE_HASWELL - return sz_rfind_charset_avx2(text, length, set); -#elif SZ_USE_NEON - return sz_rfind_charset_neon(text, length, set); -#else - return sz_rfind_charset_serial(text, length, set); -#endif -} - -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_serial(a, a_length, b, b_length, bound); -} - -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_utf8_serial(a, a_length, b, b_length, bound); -} - -SZ_DYNAMIC sz_size_t sz_edit_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { -#if SZ_USE_ICE - return sz_edit_distance_avx512(a, a_length, b, b_length, bound, alloc); -#else - return sz_edit_distance_serial(a, a_length, b, b_length, bound, alloc); -#endif -} - -SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - return _sz_edit_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc); -} - -SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, - sz_error_cost_t const *subs, sz_error_cost_t gap, - sz_memory_allocator_t *alloc) { -#if SZ_USE_ICE - return sz_alignment_score_avx512(a, a_length, b, b_length, subs, gap, alloc); -#else - return sz_alignment_score_serial(a, a_length, b, b_length, subs, gap, alloc); -#endif -} - -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle) { -#if SZ_USE_ICE - sz_hashes_avx512(text, length, window_length, window_step, callback, callback_handle); -#elif SZ_USE_HASWELL - sz_hashes_avx2(text, length, window_length, window_step, callback, callback_handle); -#else - sz_hashes_serial(text, length, window_length, window_step, callback, callback_handle); -#endif -} - 
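As an illustration of how the dispatch layer above is consumed, here is a minimal caller-side sketch. It assumes the single-header `<stringzilla/stringzilla.h>` include used elsewhere in this repository and a build with `SZ_DYNAMIC_DISPATCH` disabled, so every call resolves at compile time to the best enabled backend (AVX-512, AVX2, NEON, or serial):

#include <string.h>                  // strlen
#include <stringzilla/stringzilla.h> // sz_find, sz_find_charset, sz_charset_*

int main(void) {
    char const *haystack = "the quick brown fox jumps over the lazy dog";
    sz_size_t haystack_length = strlen(haystack);

    // Substring search: the preprocessor cascade picks the backend, the caller never names it.
    sz_cptr_t word = sz_find(haystack, haystack_length, "lazy", 4);

    // Character-set search: same dispatch logic, different primitive.
    sz_charset_t vowels;
    sz_charset_init(&vowels);
    sz_charset_add(&vowels, 'a'), sz_charset_add(&vowels, 'e'), sz_charset_add(&vowels, 'i');
    sz_charset_add(&vowels, 'o'), sz_charset_add(&vowels, 'u');
    sz_cptr_t first_vowel = sz_find_charset(haystack, haystack_length, &vowels);

    return (word && first_vowel) ? 0 : 1;
}

The same pattern applies to every `SZ_DYNAMIC` symbol in this region: only the `SZ_USE_*` flags decide which `*_serial`, `*_avx2`, or `*_neon` body gets compiled in.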
-SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_rfind_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_rfind_charset(h, h_length, &set); -} - -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { - sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data); -} - -#endif -#pragma endregion - #ifdef __cplusplus -#pragma GCC diagnostic pop } #endif // __cplusplus diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index a80da804..f65b0212 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -20,6 +20,7 @@ * @brief When set to 1, the library will include the C++ STL headers and implement * automatic conversion from and to `std::stirng_view` and `std::basic_string`. */ +#include "types.h" #ifndef SZ_AVOID_STL #define SZ_AVOID_STL (0) // true or false #endif @@ -2069,7 +2070,7 @@ class basic_string { * @brief The number of characters that can be stored in the internal buffer. * Depends on the size of the internal buffer for the "Small String Optimization". */ - static constexpr size_type min_capacity = SZ_STRING_INTERNAL_SPACE - 1; + static constexpr size_type min_capacity = _SZ_STRING_INTERNAL_SPACE - 1; #pragma region Constructors and STL Utilities @@ -3663,8 +3664,9 @@ bool basic_string::try_assign(concatenation -bool basic_string::try_preparing_replacement(size_type offset, size_type length, - size_type replacement_length) noexcept { +bool basic_string::try_preparing_replacement( // + size_type offset, size_type length, size_type replacement_length) noexcept { + // There are three cases: // 1. The replacement is the same length as the replaced range. // 2. The replacement is shorter than the replaced range. 
@@ -3759,10 +3761,11 @@ typename concatenation_result::type // std::string result; // result.reserve(total_size); // (result.append(strings), ...); - return ashvardanian::stringzilla::concatenate( + return ashvardanian::stringzilla::concatenate( // std::forward(first), - ashvardanian::stringzilla::concatenate(std::forward(second), - std::forward(following)...)); + ashvardanian::stringzilla::concatenate( // + std::forward(second), // + std::forward(following)...)); } /** @@ -3770,8 +3773,9 @@ typename concatenation_result::type * @see sz_edit_distance */ template -std::size_t hamming_distance(basic_string_slice const &a, basic_string_slice const &b, - std::size_t bound = 0) noexcept { +std::size_t hamming_distance( // + basic_string_slice const &a, basic_string_slice const &b, // + std::size_t bound = 0) noexcept { return sz_hamming_distance(a.data(), a.size(), b.data(), b.size(), bound); } @@ -3780,8 +3784,9 @@ std::size_t hamming_distance(basic_string_slice const &a, basic_stri * @see sz_edit_distance */ template ::type>> -std::size_t hamming_distance(basic_string const &a, - basic_string const &b, std::size_t bound = 0) noexcept { +std::size_t hamming_distance( // + basic_string const &a, basic_string const &b, // + std::size_t bound = 0) noexcept { return ashvardanian::stringzilla::hamming_distance(a.view(), b.view(), bound); } @@ -3790,8 +3795,8 @@ std::size_t hamming_distance(basic_string const &a, * @see sz_hamming_distance_utf8 */ template -std::size_t hamming_distance_utf8(basic_string_slice const &a, basic_string_slice const &b, - std::size_t bound = 0) noexcept { +std::size_t hamming_distance_utf8( // + basic_string_slice const &a, basic_string_slice const &b, std::size_t bound = 0) noexcept { return sz_hamming_distance_utf8(a.data(), a.size(), b.data(), b.size(), bound); } @@ -3800,8 +3805,9 @@ std::size_t hamming_distance_utf8(basic_string_slice const &a, basic * @see sz_edit_distance */ template ::type>> -std::size_t hamming_distance_utf8(basic_string const &a, - basic_string const &b, std::size_t bound = 0) noexcept { +std::size_t hamming_distance_utf8( // + basic_string const &a, basic_string const &b, + std::size_t bound = 0) noexcept { return ashvardanian::stringzilla::hamming_distance_utf8(a.view(), b.view(), bound); } @@ -3810,8 +3816,9 @@ std::size_t hamming_distance_utf8(basic_string cons * @see sz_edit_distance */ template ::type>> -std::size_t edit_distance(basic_string_slice const &a, basic_string_slice const &b, - std::size_t bound = 0, allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) { +std::size_t edit_distance( // + basic_string_slice const &a, basic_string_slice const &b, std::size_t bound = SZ_SIZE_MAX, + allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) { std::size_t result; if (!_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) { result = sz_edit_distance(a.data(), a.size(), b.data(), b.size(), bound, &alloc); @@ -3826,8 +3833,9 @@ std::size_t edit_distance(basic_string_slice const &a, basic_string_ * @see sz_edit_distance */ template > -std::size_t edit_distance(basic_string const &a, - basic_string const &b, std::size_t bound = 0) noexcept(false) { +std::size_t edit_distance( // + basic_string const &a, basic_string const &b, // + std::size_t bound = SZ_SIZE_MAX) noexcept(false) { return ashvardanian::stringzilla::edit_distance(a.view(), b.view(), bound, a.get_allocator()); } @@ -3836,9 +3844,9 @@ std::size_t edit_distance(basic_string const &a, * @see sz_edit_distance_utf8 */ template ::type>> -std::size_t 
edit_distance_utf8(basic_string_slice const &a, basic_string_slice const &b, - std::size_t bound = 0, - allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) { +std::size_t edit_distance_utf8( // + basic_string_slice const &a, basic_string_slice const &b, // + std::size_t bound = SZ_SIZE_MAX, allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) { std::size_t result; if (!_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) { result = sz_edit_distance_utf8(a.data(), a.size(), b.data(), b.size(), bound, &alloc); @@ -3853,9 +3861,9 @@ std::size_t edit_distance_utf8(basic_string_slice const &a, basic_st * @see sz_edit_distance_utf8 */ template > -std::size_t edit_distance_utf8(basic_string const &a, - basic_string const &b, - std::size_t bound = 0) noexcept(false) { +std::size_t edit_distance_utf8( // + basic_string const &a, basic_string const &b, // + std::size_t bound = SZ_SIZE_MAX) noexcept(false) { return ashvardanian::stringzilla::edit_distance_utf8(a.view(), b.view(), bound, a.get_allocator()); } @@ -3864,9 +3872,10 @@ std::size_t edit_distance_utf8(basic_string const & * @see sz_alignment_score */ template ::type>> -std::ptrdiff_t alignment_score(basic_string_slice const &a, basic_string_slice const &b, - std::int8_t const (&subs)[256][256], std::int8_t gap = -1, - allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) { +std::ptrdiff_t alignment_score( // + basic_string_slice const &a, basic_string_slice const &b, // + std::int8_t const (&subs)[256][256], std::int8_t gap = -1, + allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) { static_assert(sizeof(sz_error_cost_t) == sizeof(std::int8_t), "sz_error_cost_t must be 8-bit."); static_assert(std::is_signed() == std::is_signed(), @@ -3886,9 +3895,9 @@ std::ptrdiff_t alignment_score(basic_string_slice const &a, basic_st * @see sz_alignment_score */ template > -std::ptrdiff_t alignment_score(basic_string const &a, - basic_string const &b, // - std::int8_t const (&subs)[256][256], std::int8_t gap = -1) noexcept(false) { +std::ptrdiff_t alignment_score( // + basic_string const &a, basic_string const &b, // + std::int8_t const (&subs)[256][256], std::int8_t gap = -1) noexcept(false) { return ashvardanian::stringzilla::alignment_score(a.view(), b.view(), subs, gap, a.get_allocator()); } @@ -3900,8 +3909,9 @@ std::ptrdiff_t alignment_score(basic_string const & * @param alphabet A string of characters to choose from. 
*/ template -void randomize(basic_string_slice string, generator_type_ &generator, - string_view alphabet = "abcdefghijklmnopqrstuvwxyz") noexcept { +void randomize( // + basic_string_slice string, generator_type_ &generator, + string_view alphabet = "abcdefghijklmnopqrstuvwxyz") noexcept { static_assert(!std::is_const::value, "The string must be mutable."); sz_random_generator_t generator_callback = &_call_random_generator; sz_generate(alphabet.data(), alphabet.size(), string.data(), string.size(), generator_callback, &generator); @@ -3921,8 +3931,9 @@ void transform(basic_string_slice string, basic_look_up_table -void transform(basic_string_slice source, basic_look_up_table const &table, - char_type_ *target) noexcept { +void transform( // + basic_string_slice source, basic_look_up_table const &table, + char_type_ *target) noexcept { static_assert(sizeof(char_type_) == 1, "The character type must be 1 byte long."); sz_look_up_transform((sz_cptr_t)source.data(), (sz_size_t)source.size(), (sz_cptr_t)table.raw(), (sz_ptr_t)target); } @@ -4007,8 +4018,9 @@ void sorted_order(objects_type_ const *begin, objects_type_ const *end, sorted_i * @see sz_hashes */ template -void hashes_fingerprint(basic_string_slice const &str, std::size_t window_length, - std::bitset &fingerprint) noexcept { +void hashes_fingerprint( // + basic_string_slice const &str, std::size_t window_length, + std::bitset &fingerprint) noexcept { constexpr std::size_t fingerprint_bytes = sizeof(std::bitset); return sz_hashes_fingerprint(str.data(), str.size(), window_length, (sz_ptr_t)&fingerprint, fingerprint_bytes); } @@ -4018,8 +4030,8 @@ void hashes_fingerprint(basic_string_slice const &str, std::size_t w * @see sz_hashes */ template -std::bitset hashes_fingerprint(basic_string_slice const &str, - std::size_t window_length) noexcept { +std::bitset hashes_fingerprint( // + basic_string_slice const &str, std::size_t window_length) noexcept { std::bitset fingerprint; ashvardanian::stringzilla::hashes_fingerprint(str, window_length, fingerprint); return fingerprint; @@ -4040,8 +4052,8 @@ std::bitset hashes_fingerprint(basic_string const &str * @throw `std::bad_alloc` if the allocation fails. 
*/ template -std::vector sorted_order(objects_type_ const *begin, objects_type_ const *end, - string_extractor_ &&extractor) noexcept(false) { +std::vector sorted_order( // + objects_type_ const *begin, objects_type_ const *end, string_extractor_ &&extractor) noexcept(false) { std::vector order(end - begin); sorted_order(begin, end, order.data(), std::forward(extractor)); return order; @@ -4054,8 +4066,8 @@ std::vector sorted_order(objects_type_ const *begin, objects_type_ */ template std::vector sorted_order(string_like_type_ const *begin, string_like_type_ const *end) noexcept(false) { - static_assert(std::is_convertible::value, - "The type must be convertible to string_view."); + static_assert( // + std::is_convertible::value, "The type must be convertible to string_view."); return sorted_order(begin, end, [](string_like_type_ const &s) -> string_view { return s; }); } @@ -4066,8 +4078,8 @@ std::vector sorted_order(string_like_type_ const *begin, string_li */ template std::vector sorted_order(std::vector const &array) noexcept(false) { - static_assert(std::is_convertible::value, - "The type must be convertible to string_view."); + static_assert( // + std::is_convertible::value, "The type must be convertible to string_view."); return sorted_order(array.data(), array.data() + array.size(), [](string_like_type_ const &s) -> string_view { return s; }); } diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index be4a3e0d..8002b8a0 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -356,6 +356,12 @@ typedef union sz_charset_t { /** @brief Initializes a bit-set to an empty collection, meaning - all characters are banned. */ SZ_PUBLIC void sz_charset_init(sz_charset_t *s) { s->_u64s[0] = s->_u64s[1] = s->_u64s[2] = s->_u64s[3] = 0; } +/** @brief Initializes a bit-set to all ASCII character. */ +SZ_PUBLIC void sz_charset_init_ascii(sz_charset_t *s) { + s->_u64s[0] = s->_u64s[1] = 0xFFFFFFFFFFFFFFFFull; + s->_u64s[2] = s->_u64s[3] = 0; +} + /** @brief Adds a character to the set and accepts @b unsigned integers. */ SZ_PUBLIC void sz_charset_add_u8(sz_charset_t *s, sz_u8_t c) { s->_u64s[c >> 6] |= (1ull << (c & 63u)); } @@ -697,7 +703,7 @@ SZ_PUBLIC void sz_sequence_from_u64tape( // #define SZ_CACHE_LINE_WIDTH (64) // bytes /** - * @brief Similar to `assert`, the `sz_assert` is used in the SZ_DEBUG mode + * @brief Similar to `assert`, the `_sz_assert` is used in the SZ_DEBUG mode * to check the invariants of the library. It's a no-op in the SZ_RELEASE mode. * @note If you want to catch it, put a breakpoint at @b `__GI_exit` */ @@ -708,12 +714,12 @@ SZ_PUBLIC void _sz_assert_failure(char const *condition, char const *file, int l fprintf(stderr, "Assertion failed: %s, in file %s, line %d\n", condition, file, line); exit(EXIT_FAILURE); } -#define sz_assert(condition) \ +#define _sz_assert(condition) \ do { \ if (!(condition)) { _sz_assert_failure(#condition, __FILE__, __LINE__); } \ } while (0) #else -#define sz_assert(condition) ((void)(condition)) +#define _sz_assert(condition) ((void)(condition)) #endif /* Intrinsics aliases for MSVC, GCC, Clang, and Clang-Cl. @@ -732,13 +738,13 @@ SZ_PUBLIC void _sz_assert_failure(char const *condition, char const *file, int l // Use the serial version on 32-bit x86 and on Arm. 
#if (defined(_WIN32) && !defined(_WIN64)) || defined(_M_ARM) || defined(_M_ARM64) SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { - sz_assert(x != 0); + _sz_assert(x != 0); int n = 0; while ((x & 1) == 0) { n++, x >>= 1; } return n; } SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { - sz_assert(x != 0); + _sz_assert(x != 0); int n = 0; while ((x & 0x8000000000000000ull) == 0) { n++, x <<= 1; } return n; @@ -749,13 +755,13 @@ SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return (((x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Full) * 0x0101010101010101ull) >> 56; } SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { - sz_assert(x != 0); + _sz_assert(x != 0); int n = 0; while ((x & 1) == 0) { n++, x >>= 1; } return n; } SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { - sz_assert(x != 0); + _sz_assert(x != 0); int n = 0; while ((x & 0x80000000u) == 0) { n++, x <<= 1; } return n; @@ -896,7 +902,7 @@ SZ_INTERNAL void sz_ssize_clamp_interval( // * @brief Compute the logarithm base 2 of a positive integer, rounding down. */ SZ_INTERNAL sz_size_t sz_size_log2i_nonzero(sz_size_t x) { - sz_assert(x > 0 && "Non-positive numbers have no defined logarithm"); + _sz_assert(x > 0 && "Non-positive numbers have no defined logarithm"); sz_size_t leading_zeros = sz_u64_clz(x); return 63 - leading_zeros; } @@ -1042,33 +1048,6 @@ SZ_INTERNAL void _sz_memory_free_fixed(sz_ptr_t start, sz_size_t length, void *h sz_unused(start && length && handle); } -/** @brief An internal callback used to set a bit in a power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) & (fingerprint_bytes - 1)] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback used to set a bit in a @b non power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_non_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) % fingerprint_bytes] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback, used to mix all the running hashes into one pointer-size value. 
*/ -SZ_INTERNAL void _sz_hashes_fingerprint_scalar_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, - void *scalar_handle) { - sz_unused(start && length && hash && scalar_handle); - sz_size_t *scalar_ptr = (sz_size_t *)scalar_handle; - *scalar_ptr ^= hash; -} - #pragma GCC visibility pop #pragma endregion diff --git a/scripts/bench_memory.cpp b/scripts/bench_memory.cpp index ee6ae03b..93d7ab2d 100644 --- a/scripts/bench_memory.cpp +++ b/scripts/bench_memory.cpp @@ -69,11 +69,11 @@ tracked_unary_functions_t copy_functions(sz_cptr_t dataset_start_ptr, sz_ptr_t o tracked_unary_functions_t result = { {"memcpy" + suffix, wrap_sz(memcpy)}, {"sz_copy_serial" + suffix, wrap_sz(sz_copy_serial)}, -#if SZ_USE_ICE - {"sz_copy_avx512" + suffix, wrap_sz(sz_copy_avx512)}, +#if SZ_USE_SKYLAKE + {"sz_copy_skylake" + suffix, wrap_sz(sz_copy_skylake)}, #endif #if SZ_USE_HASWELL - {"sz_copy_avx2" + suffix, wrap_sz(sz_copy_avx2)}, + {"sz_copy_haswell" + suffix, wrap_sz(sz_copy_haswell)}, #endif #if SZ_USE_SVE {"sz_copy_sve" + suffix, wrap_sz(sz_copy_sve)}, @@ -109,11 +109,11 @@ tracked_unary_functions_t fill_functions(sz_cptr_t dataset_start_ptr, sz_ptr_t o return slice.size(); })}, {"sz_fill_serial", wrap_sz(sz_fill_serial)}, -#if SZ_USE_ICE - {"sz_fill_avx512", wrap_sz(sz_fill_avx512)}, +#if SZ_USE_SKYLAKE + {"sz_fill_avx512", wrap_sz(sz_fill_skylake)}, #endif #if SZ_USE_HASWELL - {"sz_fill_avx2", wrap_sz(sz_fill_avx2)}, + {"sz_fill_haswell", wrap_sz(sz_fill_haswell)}, #endif #if SZ_USE_SVE {"sz_fill_sve", wrap_sz(sz_fill_sve)}, @@ -149,11 +149,11 @@ tracked_unary_functions_t move_functions(sz_cptr_t dataset_start_ptr, sz_ptr_t o tracked_unary_functions_t result = { {"memmove" + suffix, wrap_sz(memmove)}, {"sz_move_serial" + suffix, wrap_sz(sz_move_serial)}, -#if SZ_USE_ICE - {"sz_move_avx512" + suffix, wrap_sz(sz_move_avx512)}, +#if SZ_USE_SKYLAKE + {"sz_move_skylake" + suffix, wrap_sz(sz_move_skylake)}, #endif #if SZ_USE_HASWELL - {"sz_move_avx2" + suffix, wrap_sz(sz_move_avx2)}, + {"sz_move_haswell" + suffix, wrap_sz(sz_move_haswell)}, #endif #if SZ_USE_NEON {"sz_move_neon" + suffix, wrap_sz(sz_move_neon)}, @@ -196,7 +196,7 @@ tracked_unary_functions_t transform_functions() { {"sz_look_up_transform_ice", wrap_sz(sz_look_up_transform_ice)}, #endif #if SZ_USE_HASWELL - {"sz_look_up_transform_avx2", wrap_sz(sz_look_up_transform_avx2)}, + {"sz_look_up_transform_haswell", wrap_sz(sz_look_up_transform_haswell)}, #endif #if SZ_USE_NEON {"sz_look_up_transform_neon", wrap_sz(sz_look_up_transform_neon)}, diff --git a/scripts/bench_similarity.cpp b/scripts/bench_similarity.cpp index 140433e2..9aa964c3 100644 --- a/scripts/bench_similarity.cpp +++ b/scripts/bench_similarity.cpp @@ -52,11 +52,11 @@ tracked_binary_functions_t distance_functions() { }; tracked_binary_functions_t result = { {"naive", wrap_baseline}, - {"sz_edit_distance", wrap_sz_distance(sz_edit_distance_serial), true}, - {"sz_alignment_score", wrap_sz_scoring(sz_alignment_score_serial), true}, + {"sz_edit_distance_serial", wrap_sz_distance(sz_edit_distance_serial), true}, + {"sz_alignment_score_serial", wrap_sz_scoring(sz_alignment_score_serial), true}, #if SZ_USE_ICE - {"sz_edit_distance_avx512", wrap_sz_distance(sz_edit_distance_avx512), true}, - {"sz_alignment_score_avx512", wrap_sz_scoring(sz_alignment_score_avx512), true}, + {"sz_edit_distance_ice", wrap_sz_distance(sz_edit_distance_ice), true}, + {"sz_alignment_score_ice", wrap_sz_scoring(sz_alignment_score_ice), true}, #endif }; return result; diff --git a/scripts/bench_token.cpp 
b/scripts/bench_token.cpp index 1120ad52..492f93f4 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -55,8 +55,8 @@ tracked_unary_functions_t sliding_hashing_functions(std::size_t window_width, st }; std::string suffix = std::to_string(window_width) + ":step" + std::to_string(step); tracked_unary_functions_t result = { -#if SZ_USE_ICE - {"sz_hashes_avx512:" + suffix, wrap_sz(sz_hashes_avx512)}, +#if SZ_USE_SKYLAKE + {"sz_hashes_skylake:" + suffix, wrap_sz(sz_hashes_skylake)}, #endif #if SZ_USE_HASWELL {"sz_hashes_haswell:" + suffix, wrap_sz(sz_hashes_haswell)}, @@ -120,8 +120,8 @@ tracked_binary_functions_t equality_functions() { #if SZ_USE_HASWELL {"sz_equal_haswell", wrap_sz(sz_equal_haswell), true}, #endif -#if SZ_USE_ICE - {"sz_equal_avx512", wrap_sz(sz_equal_avx512), true}, +#if SZ_USE_SKYLAKE + {"sz_equal_skylake", wrap_sz(sz_equal_skylake), true}, #endif {"memcmp", [](std::string_view a, std::string_view b) { @@ -147,8 +147,8 @@ tracked_binary_functions_t ordering_functions() { #if SZ_USE_HASWELL {"sz_order_haswell", wrap_sz(sz_order_haswell), true}, #endif -#if SZ_USE_ICE - {"sz_order_avx512", wrap_sz(sz_order_avx512), true}, +#if SZ_USE_SKYLAKE + {"sz_order_skylake", wrap_sz(sz_order_skylake), true}, #endif {"memcmp", [](std::string_view a, std::string_view b) { diff --git a/scripts/test.cpp b/scripts/test.cpp index db856a8e..3f9add3b 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -121,7 +121,7 @@ static void test_arithmetical_utilities() { assert(sz_size_bit_ceil(1000000000ull) == (1ull << 30)); assert(sz_size_bit_ceil(2000000000ull) == (1ull << 31)); -#if SZ_DETECT_64_BIT +#if _SZ_IS_64_BIT assert(sz_size_bit_ceil(4000000000ull) == (1ull << 32)); assert(sz_size_bit_ceil(8000000000ull) == (1ull << 33)); assert(sz_size_bit_ceil(16000000000ull) == (1ull << 34)); @@ -130,11 +130,6 @@ static void test_arithmetical_utilities() { assert(sz_size_bit_ceil((1ull << 62) + 1) == (1ull << 63)); assert(sz_size_bit_ceil((1ull << 63)) == (1ull << 63)); #endif - - for (sz_u16_t number = 0; number != 256; ++number) - for (sz_u16_t divisor = 2; divisor != 256; ++divisor) - assert(sz_u8_divide(static_cast(number), static_cast(divisor)) == - (static_cast(number) / static_cast(divisor))); } inline void expect_equality(char const *a, char const *b, std::size_t size) { @@ -571,7 +566,7 @@ static void test_stl_compatibility_for_updates() { // On 32-bit systems the base capacity can be larger than our `z::string::min_capacity`. // It's true for MSVC: https://github.com/ashvardanian/StringZilla/issues/168 - if (SZ_DETECT_64_BIT) assert_scoped(str s = "hello", s.shrink_to_fit(), s.capacity() <= sz::string::min_capacity); + if (_SZ_IS_64_BIT) assert_scoped(str s = "hello", s.shrink_to_fit(), s.capacity() <= sz::string::min_capacity); // Concatenation. // Following are missing in strings, but are present in vectors. 
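
Editorial note on the `sz_size_bit_ceil` expectations exercised earlier in this `test.cpp` diff: the asserts treat it as "the smallest power of two that is not smaller than the argument". A minimal, intrinsics-free reference sketch of that contract, illustrative only and not the library's implementation (the helper name is hypothetical, and overflow above 2^63 is ignored):

    SZ_INTERNAL sz_size_t naive_bit_ceil(sz_size_t x) {
        sz_size_t result = 1;            /* by this convention, the ceiling of 0 and 1 is 1 */
        while (result < x) result <<= 1; /* double until we reach or pass `x` */
        return result;
    }
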
@@ -1559,16 +1554,16 @@ static void test_stl_containers() { int main(int argc, char const **argv) { - auto dist = _sz_edit_distance_skewed_diagonals_upto63_avx512("kiten", 5, "katerinas", 9, SZ_SIZE_MAX); - sz_assert(dist == 5); - dist = _sz_edit_distance_skewed_diagonals_upto63_avx512("kiten", 5, "katerinas", 9, 3); - sz_assert(dist == SZ_SIZE_MAX); - dist = _sz_edit_distance_skewed_diagonals_upto63_avx512("kiten", 5, "katerinas", 9, 4); - sz_assert(dist == SZ_SIZE_MAX); - dist = _sz_edit_distance_skewed_diagonals_upto63_avx512("kiten", 5, "katerinas", 9, 5); - sz_assert(dist == 5); - dist = _sz_edit_distance_skewed_diagonals_upto63_avx512("kiten", 5, "katerinas", 9, 6); - sz_assert(dist == 5); + auto dist = _sz_edit_distance_skewed_diagonals_upto63_ice("kiten", 5, "katerinas", 9, SZ_SIZE_MAX); + _sz_assert(dist == 5); + dist = _sz_edit_distance_skewed_diagonals_upto63_ice("kiten", 5, "katerinas", 9, 3); + _sz_assert(dist == SZ_SIZE_MAX); + dist = _sz_edit_distance_skewed_diagonals_upto63_ice("kiten", 5, "katerinas", 9, 4); + _sz_assert(dist == SZ_SIZE_MAX); + dist = _sz_edit_distance_skewed_diagonals_upto63_ice("kiten", 5, "katerinas", 9, 5); + _sz_assert(dist == 5); + dist = _sz_edit_distance_skewed_diagonals_upto63_ice("kiten", 5, "katerinas", 9, 6); + _sz_assert(dist == 5); // Similarity measures and fuzzy search test_levenshtein_distances(); From 364e2ca4908fa3eae21689dc3afc7a1460a1a023 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 8 Dec 2024 19:48:31 +0000 Subject: [PATCH 049/751] Make: Rename `stringzillite` to `stringzilla_bare` --- .github/workflows/prerelease.yml | 2 +- .github/workflows/release.yml | 24 +++++------ CMakeLists.txt | 68 ++++++++++++++++++++++++-------- CONTRIBUTING.md | 4 +- 4 files changed, 66 insertions(+), 32 deletions(-) diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml index 8f8b7803..57514b79 100644 --- a/.github/workflows/prerelease.yml +++ b/.github/workflows/prerelease.yml @@ -275,7 +275,7 @@ jobs: # We can't run the produced builds, but we can make sure they exist - name: Test artifacts presense run: | - test -e build_artifacts/libstringzillite.so + test -e build_artifacts/libstringzilla_bare.so test -e build_artifacts/libstringzilla_shared.so test -e build_artifacts/stringzilla_test_cpp20 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 80a8f989..6a726b14 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -253,15 +253,15 @@ jobs: cmake --build build_release --config Release - cp build_release/libstringzillite.so "stringzillite_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}.so" - mkdir -p "stringzillite_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}/DEBIAN" - touch "stringzillite_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}/DEBIAN/control" - mkdir -p "stringzillite_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}/usr/local/lib" - mkdir "stringzillite_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}/usr/local/include" - cp include/stringzilla/stringzilla.h "stringzillite_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}/usr/local/include/" - cp build_release/libstringzillite.so "stringzillite_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}/usr/local/lib/" - echo -e "Package: stringzilla\nVersion: ${{ steps.set_version.outputs.version }}\nMaintainer: Ash Vardanian\nArchitecture: 
${{ matrix.arch }}\nDescription: SIMD-accelerated string search, sort, hashes, fingerprints, & edit distances" > "stringzillite_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}/DEBIAN/control" - dpkg-deb --build "stringzillite_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}" + cp build_release/libstringzilla_bare.so "stringzilla_bare_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}.so" + mkdir -p "stringzilla_bare_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}/DEBIAN" + touch "stringzilla_bare_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}/DEBIAN/control" + mkdir -p "stringzilla_bare_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}/usr/local/lib" + mkdir "stringzilla_bare_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}/usr/local/include" + cp include/stringzilla/stringzilla.h "stringzilla_bare_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}/usr/local/include/" + cp build_release/libstringzilla_bare.so "stringzilla_bare_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}/usr/local/lib/" + echo -e "Package: stringzilla\nVersion: ${{ steps.set_version.outputs.version }}\nMaintainer: Ash Vardanian\nArchitecture: ${{ matrix.arch }}\nDescription: SIMD-accelerated string search, sort, hashes, fingerprints, & edit distances" > "stringzilla_bare_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}/DEBIAN/control" + dpkg-deb --build "stringzilla_bare_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}" - name: Upload library uses: xresloader/upload-to-github-release@v1 @@ -314,14 +314,14 @@ jobs: run: | cmake -DCMAKE_BUILD_TYPE=Release -B build_release cmake --build build_release --config Release - tar -cvf "stringzillite_windows_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}.tar" "build_release/stringzillite.dll" "./include/stringzilla/stringzilla.h" + tar -cvf "stringzilla_bare_windows_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}.tar" "build_release/stringzilla_bare.dll" "./include/stringzilla/stringzilla.h" - name: Upload archive uses: xresloader/upload-to-github-release@v1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: - file: "stringzillite_windows_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}.tar" + file: "stringzilla_bare_windows_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}.tar" update_latest_release: true create_macos_library: @@ -349,7 +349,7 @@ jobs: run: | cmake -DCMAKE_BUILD_TYPE=Release -B build_release cmake --build build_release --config Release - zip -r stringzillite_macos_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}.zip build_release/libstringzillite.dylib include/stringzilla/stringzilla.h + zip -r stringzilla_bare_macos_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}.zip build_release/libstringzilla_bare.dylib include/stringzilla/stringzilla.h - name: Upload archive uses: xresloader/upload-to-github-release@v1 diff --git a/CMakeLists.txt b/CMakeLists.txt index c09fd6e7..6b931960 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,37 @@ -cmake_minimum_required(VERSION 3.1) +# StringZilla CMakeLists.txt +# +# This file defines several library build & installation targets: +# +# - stringzilla_header: A header-only library with the StringZilla C and C++ headers. +# - stringzilla_shared: A shared library with the StringZilla C and C++ headers and dynamic SIMD dispatch. 
+# - stringzilla_bare: A shared library with the StringZilla headers, but without linking the standard C library. +# +# Tests for different C++ standards: +# +# - stringzilla_test_cpp11: A test executable for C++11. +# - stringzilla_test_cpp14: A test executable for C++14. +# - stringzilla_test_cpp17: A test executable for C++17. +# - stringzilla_test_cpp20: A test executable for C++20. +# +# Tests for different SIMD architectures: +# +# - stringzilla_test_cpp20_serial: A test executable for serial execution. +# - stringzilla_test_cpp20_haswell: A test executable for AVX2. +# - stringzilla_test_cpp20_ice: A test executable for AVX-512. +# - stringzilla_test_cpp20_neon: A test executable for ARM Neon. +# - stringzilla_test_cpp20_sve: A test executable for ARM Scalable Vector Extension. +# +# Benchmarks: +# +# - stringzilla_bench_search: A benchmark for substring search operations. +# - stringzilla_bench_similarity: A benchmark for similarity operations. +# - stringzilla_bench_sort: A benchmark for sorting operations. +# - stringzilla_bench_token: A benchmark for comparators and hash functions. +# - stringzilla_bench_container: A benchmark for STL containers powered by StringZilla. +# - stringzilla_bench_memory: A benchmark for LibC-style low-level memory operations. +# +# For higher-level language bindings separate build scripts are provided, native to each toolchain. +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) project( stringzilla VERSION 3.11.0 @@ -7,7 +40,7 @@ project( HOMEPAGE_URL "https://github.com/ashvardanian/stringzilla") set(CMAKE_C_STANDARD 99) -set(CMAKE_CXX_STANDARD 17) # This gives many issues for msvc and clang-cl, especially if later on you set it to std-c++11 later on in the tests... +set(CMAKE_CXX_STANDARD 11) set(CMAKE_C_EXTENSIONS OFF) set(CMAKE_CXX_EXTENSIONS OFF) @@ -270,18 +303,19 @@ if(${STRINGZILLA_BUILD_TEST}) if(SZ_PLATFORM_X86) # x86 specific backends if (MSVC) - define_launcher(stringzilla_test_cpp20_x86_serial scripts/test.cpp 20 "AVX") - define_launcher(stringzilla_test_cpp20_x86_avx2 scripts/test.cpp 20 "AVX2") - define_launcher(stringzilla_test_cpp20_x86_avx512 scripts/test.cpp 20 "AVX512") + define_launcher(stringzilla_test_cpp20_serial scripts/test.cpp 20 "AVX") + define_launcher(stringzilla_test_cpp20_haswell scripts/test.cpp 20 "AVX2") + define_launcher(stringzilla_test_cpp20_ice scripts/test.cpp 20 "AVX512") else() - define_launcher(stringzilla_test_cpp20_x86_serial scripts/test.cpp 20 "ivybridge") - define_launcher(stringzilla_test_cpp20_x86_avx2 scripts/test.cpp 20 "haswell") - define_launcher(stringzilla_test_cpp20_x86_avx512 scripts/test.cpp 20 "sapphirerapids") + define_launcher(stringzilla_test_cpp20_serial scripts/test.cpp 20 "ivybridge") + define_launcher(stringzilla_test_cpp20_haswell scripts/test.cpp 20 "haswell") + define_launcher(stringzilla_test_cpp20_ice scripts/test.cpp 20 "sapphirerapids") endif() elseif(SZ_PLATFORM_ARM) # ARM specific backends - define_launcher(stringzilla_test_cpp20_arm_serial scripts/test.cpp 20 "armv8-a") - define_launcher(stringzilla_test_cpp20_arm_neon scripts/test.cpp 20 "armv8-a+simd") + define_launcher(stringzilla_test_cpp20_serial scripts/test.cpp 20 "armv8-a") + define_launcher(stringzilla_test_cpp20_neon scripts/test.cpp 20 "armv8-a+simd") + define_launcher(stringzilla_test_cpp20_sve scripts/test.cpp 20 "armv8.2-a+sve") endif() endif() @@ -335,16 +369,16 @@ if(${STRINGZILLA_BUILD_SHARED}) target_compile_definitions(stringzilla_shared PRIVATE "SZ_OVERRIDE_LIBC=1") # Try compiling a version without 
linking the LibC - define_shared(stringzillite) - target_compile_definitions(stringzillite PRIVATE "SZ_AVOID_LIBC=1") - target_compile_definitions(stringzillite PRIVATE "SZ_OVERRIDE_LIBC=1") + define_shared(stringzilla_bare) + target_compile_definitions(stringzilla_bare PRIVATE "SZ_AVOID_LIBC=1") + target_compile_definitions(stringzilla_bare PRIVATE "SZ_OVERRIDE_LIBC=1") # Avoid built-ins on MSVC and other compilers, as that will cause compilation errors - target_compile_options(stringzillite PRIVATE + target_compile_options(stringzilla_bare PRIVATE "$<$:-fno-builtin;-nostdlib>" "$<$:/Oi-;/GS->") - target_link_options(stringzillite PRIVATE "$<$:-nostdlib>") - target_link_options(stringzillite PRIVATE "$<$:/NODEFAULTLIB>") + target_link_options(stringzilla_bare PRIVATE "$<$:-nostdlib>") + target_link_options(stringzilla_bare PRIVATE "$<$:/NODEFAULTLIB>") endif() @@ -362,7 +396,7 @@ if(STRINGZILLA_INSTALL) RESOURCE RUNTIME) install( - TARGETS stringzillite + TARGETS stringzilla_bare ARCHIVE BUNDLE FRAMEWORK diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 524d6c49..231291c8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -131,8 +131,8 @@ Using modern syntax, this is how you build and run the test suite: cmake -D STRINGZILLA_BUILD_TEST=1 -D CMAKE_BUILD_TYPE=Debug -B build_debug cmake --build build_debug --config Debug # Which will produce the following targets: build_debug/stringzilla_test_cpp20 # Unit test for the entire library compiled for current hardware -build_debug/stringzilla_test_cpp20_x86_serial # x86 variant compiled for IvyBridge - last arch. before AVX2 -build_debug/stringzilla_test_cpp20_arm_serial # Arm variant compiled without Neon +build_debug/stringzilla_test_cpp20_serial # x86 variant compiled for IvyBridge - last arch. before AVX2 +build_debug/stringzilla_test_cpp20_serial # Arm variant compiled without Neon ``` To use CppCheck for static analysis make sure to export the compilation commands. From 6d61c2166671ebecc57dba2a5016c5404872b02a Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 8 Dec 2024 20:02:43 +0000 Subject: [PATCH 050/751] Make: Detect Apple Universal builds Imported from #169 Co-authored-by: ashbob999 <32575256+ashbob999@users.noreply.github.com> --- pyproject.toml | 7 +++++++ setup.py | 7 ++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e93355ae..a8dd42e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -117,5 +117,12 @@ select = "*-macos*_arm64" inherit.environment = "append" environment.SZ_ARM64 = "1" +# Detect MacOS Universal2 builds +[[tool.cibuildwheel.overrides]] +select = "*-macos*_universal2" +inherit.environment = "append" +environment.SZ_X86_64 = "1" +environment.SZ_ARM64 = "1" + [tool.cibuildwheel.macos.environment] MACOSX_DEPLOYMENT_TARGET = "10.11" diff --git a/setup.py b/setup.py index 27ef6be2..a1bce8ad 100644 --- a/setup.py +++ b/setup.py @@ -88,12 +88,13 @@ def darwin_settings() -> Tuple[List[str], List[str], List[Tuple[str]]]: # so we must pre-set the CPU generation. Technically the last Intel-based Apple # product was the 2021 MacBook Pro, which had the "Coffee Lake" architecture. # During Universal builds, however, even AVX header cause compilation errors. 
- can_use_avx2 = is_64bit_x86() and sysconfig.get_platform().startswith("universal") + is_building_x86 = is_64bit_x86() or "universal" in sysconfig.get_platform() + is_building_arm = is_64bit_arm() or "universal" in sysconfig.get_platform() macros_args = [ - ("SZ_USE_HASWELL", "1" if can_use_avx2 else "0"), + ("SZ_USE_HASWELL", "1" if is_building_x86 else "0"), ("SZ_USE_SKYLAKE", "0"), ("SZ_USE_ICE", "0"), - ("SZ_USE_NEON", "1" if is_64bit_arm() else "0"), + ("SZ_USE_NEON", "1" if is_building_arm else "0"), ("SZ_USE_SVE", "0"), ] From 19c2ae9743ca9e9b767cda2eb5e828553fd850cd Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 8 Dec 2024 20:04:27 +0000 Subject: [PATCH 051/751] Improve: C++ version macros naming --- .vscode/settings.json | 4 +++- CMakeLists.txt | 8 ++++---- include/stringzilla/stringzilla.hpp | 26 +++++++++++++------------- include/stringzilla/types.h | 6 +++--- scripts/test.cpp | 20 ++++++++++---------- 5 files changed, 33 insertions(+), 31 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index ee1f1d3b..9d0e1b53 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -39,6 +39,7 @@ "cheminformatics", "cibuildwheel", "CONCAT", + "constexpr", "copydoc", "Corasick", "cptr", @@ -82,6 +83,7 @@ "Merkle-Damgård", "Mersenne", "MODINIT", + "MSVC", "napi", "nargsf", "ndim", @@ -120,7 +122,7 @@ "startswith", "STL", "stringzilla", - "stringzillite", + "stringzilla_bare", "Strs", "strzl", "substr", diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b931960..81e9bbaa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,10 +8,10 @@ # # Tests for different C++ standards: # -# - stringzilla_test_cpp11: A test executable for C++11. -# - stringzilla_test_cpp14: A test executable for C++14. -# - stringzilla_test_cpp17: A test executable for C++17. -# - stringzilla_test_cpp20: A test executable for C++20. +# - stringzilla_test_cpp11: C++11 baseline support. +# - stringzilla_test_cpp14: C++14 support with `std::less`-like function objects. +# - stringzilla_test_cpp17: C++17 support with `std::string_view` compatibility. +# - stringzilla_test_cpp20: C++20 support with `<=>` operator and more `constexpr` features. # # Tests for different SIMD architectures: # diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index f65b0212..c705dae6 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -28,18 +28,18 @@ /* We need to detect the version of the C++ language we are compiled with. * This will affect recent features like `operator<=>` and tests against STL. */ -#define SZ_DETECT_CPP_23 (__cplusplus >= 202101L) -#define SZ_DETECT_CPP20 (__cplusplus >= 202002L) -#define SZ_DETECT_CPP_17 (__cplusplus >= 201703L) -#define SZ_DETECT_CPP14 (__cplusplus >= 201402L) -#define SZ_DETECT_CPP_11 (__cplusplus >= 201103L) -#define SZ_DETECT_CPP_98 (__cplusplus >= 199711L) +#define _SZ_IS_CPP23 (__cplusplus >= 202101L) +#define _SZ_IS_CPP20 (__cplusplus >= 202002L) +#define _SZ_IS_CPP17 (__cplusplus >= 201703L) +#define _SZ_IS_CPP14 (__cplusplus >= 201402L) +#define _SZ_IS_CPP11 (__cplusplus >= 201103L) +#define _SZ_IS_CPP98 (__cplusplus >= 199711L) /** * @brief The `constexpr` keyword has different applicability scope in different C++ versions. * Useful for STL conversion operators, as several `std::string` members are `constexpr` in C++20. 
*/ -#if SZ_DETECT_CPP20 +#if _SZ_IS_CPP20 #define sz_constexpr_if_cpp20 constexpr #else #define sz_constexpr_if_cpp20 @@ -50,7 +50,7 @@ #include #include #include -#if SZ_DETECT_CPP_17 && __cpp_lib_string_view +#if _SZ_IS_CPP17 && __cpp_lib_string_view #include #endif #endif @@ -398,7 +398,7 @@ struct end_sentinel_type {}; struct include_overlaps_type {}; struct exclude_overlaps_type {}; -#if SZ_DETECT_CPP_17 +#if _SZ_IS_CPP17 inline static constexpr end_sentinel_type end_sentinel; inline static constexpr include_overlaps_type include_overlaps; inline static constexpr exclude_overlaps_type exclude_overlaps; @@ -1265,7 +1265,7 @@ class basic_string_slice { return os.write(str.data(), str.size()); } -#if SZ_DETECT_CPP_17 && __cpp_lib_string_view +#if _SZ_IS_CPP17 && __cpp_lib_string_view template ::value, int>::type = 0> sz_constexpr_if_cpp20 basic_string_slice(std::string_view const &other) noexcept @@ -1496,7 +1496,7 @@ class basic_string_slice { sz_equal(data() + other.first.size(), other.second.data(), other.second.size()) == sz_true_k; } -#if SZ_DETECT_CPP20 +#if _SZ_IS_CPP20 /** @brief Computes the lexicographic ordering between this and the ::other string. */ std::strong_ordering operator<=>(string_view other) const noexcept { @@ -2175,7 +2175,7 @@ class basic_string { return os.write(str.data(), str.size()); } -#if SZ_DETECT_CPP_17 && __cpp_lib_string_view +#if _SZ_IS_CPP17 && __cpp_lib_string_view basic_string(std::string_view other) noexcept(false) : basic_string(other.data(), other.size()) {} basic_string &operator=(std::string_view other) noexcept(false) { return assign({other.data(), other.size()}); } @@ -2421,7 +2421,7 @@ class basic_string { bool operator==(string_view other) const noexcept { return view() == other; } bool operator==(const_pointer other) const noexcept { return view() == string_view(other); } -#if SZ_DETECT_CPP20 +#if _SZ_IS_CPP20 /** @brief Computes the lexicographic ordering between this and the ::other string. */ std::strong_ordering operator<=>(basic_string const &other) const noexcept { return view() <=> other.view(); } diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index 8002b8a0..c34289fd 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -703,12 +703,12 @@ SZ_PUBLIC void sz_sequence_from_u64tape( // #define SZ_CACHE_LINE_WIDTH (64) // bytes /** - * @brief Similar to `assert`, the `_sz_assert` is used in the SZ_DEBUG mode - * to check the invariants of the library. It's a no-op in the SZ_RELEASE mode. + * @brief Similar to `assert`, the `_sz_assert` is used in the `SZ_DEBUG` mode + * to check the invariants of the library. It's a no-op in the "Release" mode. * @note If you want to catch it, put a breakpoint at @b `__GI_exit` */ #if SZ_DEBUG && defined(SZ_AVOID_LIBC) && !SZ_AVOID_LIBC && !defined(SZ_PIC) -#include // `fprintf` +#include // `fprintf`, `stderr` #include // `EXIT_FAILURE` SZ_PUBLIC void _sz_assert_failure(char const *condition, char const *file, int line) { fprintf(stderr, "Assertion failed: %s, in file %s, line %d\n", condition, file, line); diff --git a/scripts/test.cpp b/scripts/test.cpp index 3f9add3b..9ae7e14c 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -38,7 +38,7 @@ #include // Baseline #include // Baseline -#if !SZ_DETECT_CPP_11 +#if !_SZ_IS_CPP11 #error "This test requires C++11 or later." 
#endif @@ -52,7 +52,7 @@ using sz::literals::operator""_sz; * Instantiate all the templates to make the symbols visible and also check * for weird compilation errors on uncommon paths. */ -#if SZ_DETECT_CPP_17 && __cpp_lib_string_view +#if _SZ_IS_CPP17 && __cpp_lib_string_view template class std::basic_string_view; #endif template class sz::basic_string_slice; @@ -412,7 +412,7 @@ static void test_stl_compatibility_for_reads() { assert(str("b") >= str("a")); assert(str("a") < str("aa")); -#if SZ_DETECT_CPP20 && __cpp_lib_three_way_comparison +#if _SZ_IS_CPP20 && __cpp_lib_three_way_comparison // Spaceship operator instead of conventional comparions. assert((str("a") <=> str("b")) == std::strong_ordering::less); assert((str("b") <=> str("a")) == std::strong_ordering::greater); @@ -455,7 +455,7 @@ static void test_stl_compatibility_for_reads() { assert(str("hello world").compare(6, 5, "worlds", 5) == 0); // Substring "world" in both strings assert(str("hello world").compare(6, 5, "worlds", 6) < 0); // Substring "world" is less than "worlds" -#if SZ_DETECT_CPP20 && __cpp_lib_starts_ends_with +#if _SZ_IS_CPP20 && __cpp_lib_starts_ends_with // Prefix and suffix checks against strings. assert(str("https://cppreference.com").starts_with(str("http")) == true); assert(str("https://cppreference.com").starts_with(str("ftp")) == false); @@ -475,7 +475,7 @@ static void test_stl_compatibility_for_reads() { assert(str("string_view").ends_with("View") == false); #endif -#if SZ_DETECT_CPP_23 && __cpp_lib_string_contains +#if _SZ_IS_CPP23 && __cpp_lib_string_contains // Checking basic substring presence. assert(str("hello").contains(str("ell")) == true); assert(str("hello").contains(str("oll")) == false); @@ -506,7 +506,7 @@ static void test_stl_compatibility_for_reads() { assert(std::hash {}("hello") != 0); assert_scoped(std::ostringstream os, os << str("hello"), os.str() == "hello"); -#if SZ_DETECT_CPP14 +#if _SZ_IS_CPP14 // Comparison function objects are a C++14 feature. assert(std::equal_to {}("hello", "world") == false); assert(std::less {}("hello", "world") == true); @@ -660,7 +660,7 @@ static void test_stl_conversions() { sz_unused(sz); sz_unused(szv); } -#if SZ_DETECT_CPP_17 && __cpp_lib_string_view +#if _SZ_IS_CPP17 && __cpp_lib_string_view // From STL `string_view` to StringZilla and vice-versa. 
{ std::string_view stl {"hello"};
@@ -1179,7 +1179,7 @@ static void test_search() {
assert(rsplits[4] == "");
}
-#if SZ_DETECT_CPP_17 && __cpp_lib_string_view
+#if _SZ_IS_CPP17 && __cpp_lib_string_view
/**
* Evaluates the correctness of a "matcher", searching for all the occurrences of the `needle_stl`
@@ -1582,7 +1582,7 @@ int main(int argc, char const **argv) {
test_replacements();
// Compatibility with STL
-#if SZ_DETECT_CPP_17 && __cpp_lib_string_view
+#if _SZ_IS_CPP17 && __cpp_lib_string_view
test_stl_compatibility_for_reads();
#endif
test_stl_compatibility_for_reads();
@@ -1607,7 +1607,7 @@ int main(int argc, char const **argv) {
test_stl_conversions();
test_comparisons();
test_search();
-#if SZ_DETECT_CPP_17 && __cpp_lib_string_view
+#if _SZ_IS_CPP17 && __cpp_lib_string_view
test_search_with_misaligned_repetitions();
#endif

From 645539b468f3c2902061425684d9b002c43a14f7 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sun, 8 Dec 2024 20:12:39 +0000
Subject: [PATCH 052/751] Fix: Overriding LibC in 32-bit Windows

Imported from #169

Co-authored-by: ashbob999 <32575256+ashbob999@users.noreply.github.com>
---
 c/lib.c | 49 ++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 7 deletions(-)

diff --git a/c/lib.c b/c/lib.c
index 8a0a75b9..d829e379 100644
--- a/c/lib.c
+++ b/c/lib.c
@@ -202,7 +202,7 @@ __attribute__((aligned(64))) static sz_implementations_t sz_dispatch_table;
 * @brief Initializes a global static "virtual table" of supported backends
 * Run it just once to avoiding unnecessary `if`-s.
 */
-static void sz_dispatch_table_init(void) {
+SZ_DYNAMIC void sz_dispatch_table_init(void) {
 sz_implementations_t *impl = &sz_dispatch_table;
 sz_capability_t caps = sz_capabilities();
 sz_unused(caps); //< Unused when compiling on pre-SIMD machines.
@@ -294,9 +294,17 @@ static void sz_dispatch_table_init(void) {
 }
 #if defined(_MSC_VER)
-#pragma section(".CRT$XCU", read)
-__declspec(allocate(".CRT$XCU")) void (*_sz_dispatch_table_init)() = sz_dispatch_table_init;
+/*
+ * Makes sure the `sz_dispatch_table_init` function is called at startup, from either an executable or when loading
+ * a DLL. The section name must be no more than 8 characters long, and must be between .CRT$XCA and .CRT$XCZ
+ * alphabetically (exclusive). The Microsoft C++ compiler puts C++ initialisation code in .CRT$XCU, so avoid that
+ * section: https://learn.microsoft.com/en-us/cpp/c-runtime-library/crt-initialization?view=msvc-170
+ */
+#pragma comment(linker, "/INCLUDE:_sz_dispatch_table_init")
+#pragma section(".CRT$XCS", read)
+__declspec(allocate(".CRT$XCS")) void (*_sz_dispatch_table_init)() = sz_dispatch_table_init;
+/* Called either from CRT code or our own `_DLLMainCRTStartup`, when a DLL is loaded. */
 BOOL WINAPI DllMain(HINSTANCE hints, DWORD forward_reason, LPVOID lp) {
 switch (forward_reason) {
 case DLL_PROCESS_ATTACH:
@@ -309,6 +317,14 @@ BOOL WINAPI DllMain(HINSTANCE hints, DWORD forward_reason, LPVOID lp) {
 return TRUE;
 }
+#if SZ_AVOID_LIBC
+/* Called when the DLL is loaded, and there is no CRT code.
*/ +BOOL WINAPI _DllMainCRTStartup(HINSTANCE hints, DWORD forward_reason, LPVOID lp) { + DllMain(hints, forward_reason, lp); + return TRUE; +} +#endif + #else __attribute__((constructor)) static void sz_dispatch_table_init_on_gcc_or_clang(void) { sz_dispatch_table_init(); } #endif @@ -451,14 +467,20 @@ SZ_DYNAMIC void sz_generate( // } // Provide overrides for the libc mem* functions -#if SZ_OVERRIDE_LIBC && !(defined(__CYGWIN__)) +#if SZ_OVERRIDE_LIBC && !defined(__CYGWIN__) -// SZ_DYNAMIC can't be use here for MSVC, because MSVC complains about different linkage (C2375), probably due to to the -// CRT headers specifying the function as __declspec(dllimport), there might be a combination of defines that works. But -// for now they will be manually exported using linker flags +// SZ_DYNAMIC can't be use here for MSVC, because MSVC complains about different linkage (C2375), probably due +// to to the CRT headers specifying the function as `__declspec(dllimport)`, there might be a combination of +// defines that works. But for now they will be manually exported using linker flags. +// Also when building for 32-bit we must add an underscore to the exported function name, because that's +// how `__cdecl` functions are decorated in MSVC: https://stackoverflow.com/questions/62753691) #if defined(_MSC_VER) +#if SZ_DETECT_64_BIT #pragma comment(linker, "/export:memchr") +#else +#pragma comment(linker, "/export:_memchr") +#endif void *__cdecl memchr(void const *s, int c_wide, size_t n) { #else SZ_DYNAMIC void *memchr(void const *s, int c_wide, size_t n) { @@ -468,7 +490,11 @@ SZ_DYNAMIC void *memchr(void const *s, int c_wide, size_t n) { } #if defined(_MSC_VER) +#if SZ_DETECT_64_BIT #pragma comment(linker, "/export:memcpy") +#else +#pragma comment(linker, "/export:_memcpy") +#endif void *__cdecl memcpy(void *dest, void const *src, size_t n) { #else SZ_DYNAMIC void *memcpy(void *dest, void const *src, size_t n) { @@ -478,7 +504,11 @@ SZ_DYNAMIC void *memcpy(void *dest, void const *src, size_t n) { } #if defined(_MSC_VER) +#if SZ_DETECT_64_BIT #pragma comment(linker, "/export:memmove") +#else +#pragma comment(linker, "/export:_memmove") +#endif void *__cdecl memmove(void *dest, void const *src, size_t n) { #else SZ_DYNAMIC void *memmove(void *dest, void const *src, size_t n) { @@ -488,7 +518,11 @@ SZ_DYNAMIC void *memmove(void *dest, void const *src, size_t n) { } #if defined(_MSC_VER) +#if SZ_DETECT_64_BIT #pragma comment(linker, "/export:memset") +#else +#pragma comment(linker, "/export:_memset") +#endif void *__cdecl memset(void *s, int c, size_t n) { #else SZ_DYNAMIC void *memset(void *s, int c, size_t n) { @@ -511,5 +545,6 @@ SZ_DYNAMIC void memfrob(void *s, size_t n) { char const *base64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; sz_generate(base64, 64, s, n, SZ_NULL, SZ_NULL); } + #endif #endif // SZ_OVERRIDE_LIBC From 660923e6d1be94a0cf0e2e97a8d0cebf3af2462f Mon Sep 17 00:00:00 2001 From: Alex Bondarev <44079602+alexbarev@users.noreply.github.com> Date: Mon, 9 Dec 2024 04:30:41 +0400 Subject: [PATCH 053/751] Test: Correct edge cases in ASCII tests --- scripts/test.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/test.cpp b/scripts/test.cpp index e8123995..4aa46766 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -154,7 +154,7 @@ static void test_ascii_utilities() { assert(str("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789").is_alnum()); assert(!str("abc!").is_alnum()); - assert(!str("").is_ascii()); + 
assert(str("").is_ascii()); assert(str("\x00x7F").is_ascii()); assert(!str("abc123🔥").is_ascii()); @@ -175,9 +175,9 @@ static void test_ascii_utilities() { assert(str("ABCDEFGHIJKLMNOPQRSTUVWXYZ").is_upper()); assert(!str("ABCa").is_upper()); - assert(!str("").is_printable()); + assert(str("").is_printable()); assert(str("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+").is_printable()); - assert(!str("012\n").is_printable()); + assert(!str("012🔥").is_printable()); } inline void expect_equality(char const *a, char const *b, std::size_t size) { From 064829ae0ff2501aba404afaacfd7826586377bf Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Dec 2024 07:55:59 +0000 Subject: [PATCH 054/751] Improve: Ignore 40 commits in blame --- .git-blame-ignore-revs | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 00000000..3d26edb4 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,40 @@ +6512f1d129aeddc8601c9df7332c135038914b68 +fc9e5d61e5fb1c5031f6f10920f6b50e2530de1e +ad2af78f8651870727c5b39e1fea2eff26d71d2f +49e8d9d240993bdf68715a9c87824a032752798d +fc408fa0a0f2d947c610568bd7a5c4a60ecca443 +b835051c09a0ecfc420932de444f3c6839610764 +1ba7982559111d4fc9b58caa7bc7aa1c6e64257c +5b55e19d1378c61da88309b30a38f9cf7c64bf79 +be4c63d926c8628451726863e4d14dbd1ea374dd +8b401bd41e4bd9c29c8fad9a5b83d8232efa50c7 +295d49a38d66b08075357ac829ad66d80b5edab0 +2a1fcd113d217e3124f6501c38e93a318aca37f0 +2f7652141bd8dc3c2c38ab34321567bfcdb91d93 +9e3180019acffe5261f0a1713b4ea324dca79ea0 +45e57eefd796841cbd14ee7f75ec42b42b5bde0c +66778d6b2b3aa0eed27e32fbdceef79b8c54eda5 +c357c3ea756523d3bcc8d8f25068ad08aef5456d +9b1948b3771c21dd56954e5f43301ca8a0b8b1a9 +cbfe5c7ac6371047eae88621b092297474d0b82a +085d2d3c8b99e0f90d320dd027040e554e410929 +3464cb428ae9a8721ab82a8c4bff214aa9ce6254 +5d0d2da422c7df96f9613ec843cd47c579a2edce +89c46810c2f9bfafa31f8592339f9a1b45dcc245 +3f9c248fbf59add2246055462e8fc19dc9f1693b +e23c35ff2c2d4ccb752f4ffbf9b6f39a1677b532 +7fdc58fd26e06c41052287d47a9c729c068a95ca +10d829efcb8ed4cfa5f2db4050f8403184484423 +d74e5dca2e62eb0078cb2ebacc0dac2b8bb92d54 +1f60e6d7c81f0e285e594eb63fee6119e05a3e69 +a6768af38b40307fe66364403f141c285b3e164c +08d0a20d35d3b29a44b9c8a826d53435c3ef839c +9e9f2567d052d635722921a1d70ec63d69ec6669 +974ed78822dc0b519dd61bc1c4dc18d59fe4ad15 +b007ba571860e1d3737d1478c7f8d66ae1839e36 +14ba3bf3c43408438a7de9ad57118c747c1347b1 +9e577be71dcd2e20854bf55f08c54854b3e82989 +8cb0742b2d1b31b61fac5272f17017953c6677e6 +bd547453122e9f8565e5be15f137e7b0de37caca +22e3d1e34d62d68c1e89df7c8bdc201faa18a9de +ecb377541d0c706cf8997faff4f026b07e3f76f3 From e20d207ce5bf7b25da4740ca511a0e5ea44af41f Mon Sep 17 00:00:00 2001 From: Alex <44079602+alexbarev@users.noreply.github.com> Date: Mon, 9 Dec 2024 11:56:46 +0400 Subject: [PATCH 055/751] Fix: Correct `basic_charset` operator (#203) --- include/stringzilla/stringzilla.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index c705dae6..85589909 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -309,7 +309,7 @@ class basic_charset { basic_charset result = *this; result.bitset_._u64s[0] |= other.bitset_._u64s[0], result.bitset_._u64s[1] |= other.bitset_._u64s[1], result.bitset_._u64s[2] |= 
other.bitset_._u64s[2], result.bitset_._u64s[3] |= other.bitset_._u64s[3]; - return *this; + return result; } inline basic_charset &add(char_type c) noexcept { From 864ee03fdbbba0b71b982cd9c6a206b9a7f96dee Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Dec 2024 08:05:00 +0000 Subject: [PATCH 056/751] Fix: Initializing `basic_charset` Closes #200 --- include/stringzilla/stringzilla.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index a80da804..43869f08 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -283,7 +283,7 @@ class basic_charset { template explicit basic_charset(char_type const (&chars)[count_characters]) noexcept : basic_charset() { static_assert(count_characters > 0, "Character array cannot be empty"); - for (std::size_t i = 0; i < count_characters - 1; ++i) { // count_characters - 1 to exclude the null terminator + for (std::size_t i = 0; i != count_characters; ++i) { char_type c = chars[i]; bitset_._u64s[sz_bitcast(sz_u8_t, c) >> 6] |= (1ull << (sz_bitcast(sz_u8_t, c) & 63u)); } @@ -292,7 +292,7 @@ class basic_charset { template explicit basic_charset(std::array const &chars) noexcept : basic_charset() { static_assert(count_characters > 0, "Character array cannot be empty"); - for (std::size_t i = 0; i < count_characters - 1; ++i) { // count_characters - 1 to exclude the null terminator + for (std::size_t i = 0; i != count_characters; ++i) { char_type c = chars[i]; bitset_._u64s[sz_bitcast(sz_u8_t, c) >> 6] |= (1ull << (sz_bitcast(sz_u8_t, c) & 63u)); } From c99daf3fe04b6dd5dc2ac74803e868b2df056b31 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Dec 2024 08:06:54 +0000 Subject: [PATCH 057/751] Docs: Formatting docstring --- scripts/test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/test.cpp b/scripts/test.cpp index 4aa46766..72379f78 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -138,8 +138,8 @@ static void test_arithmetical_utilities() { } /** - * @brief Tests various ASCII-based methods (e.g., is_alpha, is_digit) - * provided by `sz::string` and `sz::string_view`. + * @brief Tests various ASCII-based methods (e.g., `is_alpha`, `is_digit`) + * provided by `sz::string` and `sz::string_view`. */ template static void test_ascii_utilities() { From 084d6534d30d668edc6d7790f8aaec438832f12b Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 10 Dec 2024 10:34:34 +0000 Subject: [PATCH 058/751] Fix: Linking `stderr` Co-authored-by: Alex Bondarev <44079602+alexbarev@users.noreply.github.com> --- include/stringzilla/types.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index c34289fd..b9e202ae 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -181,6 +181,12 @@ #include // `uint8_t` #endif +/* The headers needed for the `_sz_assert_failure` function. */ +#if SZ_DEBUG && defined(SZ_AVOID_LIBC) && !SZ_AVOID_LIBC && !defined(SZ_PIC) +#include // `fprintf`, `stderr` +#include // `EXIT_FAILURE` +#endif + /* Compile-time hardware features detection. * All of those can be controlled by the user. 
*/ @@ -708,8 +714,6 @@ SZ_PUBLIC void sz_sequence_from_u64tape( // * @note If you want to catch it, put a breakpoint at @b `__GI_exit` */ #if SZ_DEBUG && defined(SZ_AVOID_LIBC) && !SZ_AVOID_LIBC && !defined(SZ_PIC) -#include // `fprintf`, `stderr` -#include // `EXIT_FAILURE` SZ_PUBLIC void _sz_assert_failure(char const *condition, char const *file, int line) { fprintf(stderr, "Assertion failed: %s, in file %s, line %d\n", condition, file, line); exit(EXIT_FAILURE); From 48e0913944f109703ebbfcb7f14e1b7398544af7 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:21:17 +0000 Subject: [PATCH 059/751] Fix: Skylake dispatch --- include/stringzilla/memory.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/stringzilla/memory.h b/include/stringzilla/memory.h index c17f031f..d8db210b 100644 --- a/include/stringzilla/memory.h +++ b/include/stringzilla/memory.h @@ -1256,7 +1256,7 @@ SZ_PUBLIC void sz_copy_sve(sz_ptr_t target, sz_cptr_t source, sz_size_t length) #pragma region Core Functionality SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_ICE +#if SZ_USE_SKYLAKE sz_copy_skylake(target, source, length); #elif SZ_USE_HASWELL sz_copy_haswell(target, source, length); @@ -1268,7 +1268,7 @@ SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { } SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { -#if SZ_USE_ICE +#if SZ_USE_SKYLAKE sz_move_skylake(target, source, length); #elif SZ_USE_HASWELL sz_move_haswell(target, source, length); @@ -1280,7 +1280,7 @@ SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { } SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) { -#if SZ_USE_ICE +#if SZ_USE_SKYLAKE sz_fill_skylake(target, length, value); #elif SZ_USE_HASWELL sz_fill_haswell(target, length, value); From 749b0d86e5cd41df053ea214ef95000bbb90543f Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:22:55 +0000 Subject: [PATCH 060/751] Fix: Bounded Levenshtein returns The new uniform behavior across the project is to return a value different from `SZ_SIZE_MAX` when the limit is reached, to differentiate memory allocation and other errors. --- include/stringzilla/similarity.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/stringzilla/similarity.h b/include/stringzilla/similarity.h index 943f7f35..0b119127 100644 --- a/include/stringzilla/similarity.h +++ b/include/stringzilla/similarity.h @@ -408,7 +408,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // /* If the minimum distance in this row exceeded the bound, return early */ \ if (min_distance >= bound) { \ alloc->free(buffer, buffer_length, alloc->handle); \ - return bound; \ + return longer_length + 1; \ } \ _distance_t *temporary = previous_distances; \ previous_distances = current_distances; \ @@ -416,7 +416,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // } \ sz_size_t result = previous_distances[shorter_length]; \ alloc->free(buffer, buffer_length, alloc->handle); \ - return sz_min_of_two(result, bound); + return result; // Dispatch the actual computation. if (!bound) { @@ -735,7 +735,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_ice( // // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. 
__mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return SZ_SIZE_MAX; + if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return longer_length + 1; } // Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles. @@ -766,7 +766,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_ice( // // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return SZ_SIZE_MAX; + if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return longer_length + 1; } // Now let's handle the bottom right triangle. @@ -790,7 +790,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_ice( // // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return SZ_SIZE_MAX; + if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return longer_length + 1; // In every following iterations we take use a shorter prefix of each register, // but we don't need to update the `next_diagonal_mask` anymore... except for the early exit. From 2007d494c019448d440bd8a548c62246bab82f94 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:23:37 +0000 Subject: [PATCH 061/751] Fix: `sz_u512_vec_t` members visibility --- include/stringzilla/types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index b9e202ae..f8fe0c9a 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -545,10 +545,10 @@ typedef union sz_u256_vec_t { * as well as 4x XMM registers or 2x YMM registers or 1x ZMM register. 
*/ typedef union sz_u512_vec_t { -#if SZ_USE_ICE +#if SZ_USE_SKYLAKE || SZ_USE_ICE __m512i zmm; #endif -#if SZ_USE_HASWELL +#if SZ_USE_HASWELL || SZ_USE_SKYLAKE || SZ_USE_ICE __m256i ymms[2]; __m128i xmms[4]; #endif From f3811d70ee0725ea4d2395a1c7fd1125dac3bc3d Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:24:37 +0000 Subject: [PATCH 062/751] Make: Library namespaced aliases --- CMakeLists.txt | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 81e9bbaa..7914aa0e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,19 +111,9 @@ endif() # Configuration include(GNUInstallDirs) -set(STRINGZILLA_TARGET_NAME ${PROJECT_NAME}) set(STRINGZILLA_INCLUDE_BUILD_DIR "${PROJECT_SOURCE_DIR}/include/") set(STRINGZILLA_INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}") -# Define our library -add_library(${STRINGZILLA_TARGET_NAME} INTERFACE) -add_library(${PROJECT_NAME}::${STRINGZILLA_TARGET_NAME} ALIAS ${STRINGZILLA_TARGET_NAME}) - -target_include_directories( - ${STRINGZILLA_TARGET_NAME} - INTERFACE $ - $) - if(${CMAKE_VERSION} VERSION_EQUAL 3.13 OR ${CMAKE_VERSION} VERSION_GREATER 3.13) include(CTest) @@ -142,7 +132,6 @@ function(set_compiler_flags target cpp_standard target_arch) get_target_property(target_type ${target} TYPE) target_include_directories(${target} PRIVATE scripts) - target_link_libraries(${target} PRIVATE ${STRINGZILLA_TARGET_NAME}) # Set output directory for single-configuration generators (like Make) set_target_properties(${target} PROPERTIES @@ -278,6 +267,7 @@ endfunction() function(define_launcher exec_name source cpp_standard target_arch) add_executable(${exec_name} ${source}) set_compiler_flags(${exec_name} ${cpp_standard} "${target_arch}") + target_link_libraries(${exec_name} PRIVATE stringzilla_header) add_test(NAME ${exec_name} COMMAND ${exec_name}) endfunction() @@ -319,10 +309,20 @@ if(${STRINGZILLA_BUILD_TEST}) endif() endif() +# Define our libraries, first the header-only version +add_library(stringzilla_header INTERFACE) +add_library(${PROJECT_NAME}::stringzilla_header ALIAS stringzilla_header) +target_include_directories( + stringzilla_header + INTERFACE $ + $) + + if(${STRINGZILLA_BUILD_SHARED}) function(define_shared target) add_library(${target} SHARED c/lib.c) + add_library(${PROJECT_NAME}::${target} ALIAS ${target}) set_target_properties(${target} PROPERTIES VERSION ${PROJECT_VERSION} From bd7054ea9d5d0810f303a4a56fa0ee25a53410dd Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:46:57 +0000 Subject: [PATCH 063/751] Fix: Masks back to using `BZHI` --- include/stringzilla/types.h | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index f8fe0c9a..57ff7124 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -800,17 +800,6 @@ SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return __builtin_bswap SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return __builtin_bswap32(val); } #endif -/* - */ -SZ_INTERNAL sz_u16_t _sz_u16_mask_until(sz_size_t n) { return (0x0001u << n) - 1u; } -SZ_INTERNAL sz_u32_t _sz_u32_mask_until(sz_size_t n) { return (0x00000001u << n) - 1u; } -SZ_INTERNAL sz_u64_t _sz_u64_mask_until(sz_size_t n) { return (0x0000000000000001ull << n) - 1ull; } -SZ_INTERNAL sz_u16_t 
_sz_u16_clamp_mask_until(sz_size_t n) { return n < 16 ? _sz_u16_mask_until(n) : 0xFFFFu; } -SZ_INTERNAL sz_u32_t _sz_u32_clamp_mask_until(sz_size_t n) { return n < 32 ? _sz_u32_mask_until(n) : 0xFFFFFFFFu; } -SZ_INTERNAL sz_u64_t _sz_u64_clamp_mask_until(sz_size_t n) { - return n < 64 ? _sz_u64_mask_until(n) : 0xFFFFFFFFFFFFFFFFull; -} - SZ_INTERNAL sz_u64_t sz_u64_rotl(sz_u64_t x, sz_u64_t r) { return (x << r) | (x >> (64 - r)); } /** @@ -865,6 +854,22 @@ SZ_INTERNAL sz_i32_t sz_i32_min_of_two(sz_i32_t x, sz_i32_t y) { return y + ((x /** @brief Branchless minimum function for two signed 32-bit integers. */ SZ_INTERNAL sz_i32_t sz_i32_max_of_two(sz_i32_t x, sz_i32_t y) { return x - ((x - y) & (x - y) >> 31); } +/* In AVX-512 we actively use masked operations and the "K mask registers". + * Producing a mask for the first N elements of a sequence can be done using the `1 << N - 1` idiom. + * It, however, induces undefined behavior if `N == 64` or `N == 32` on 64-bit or 32-bit systems respectively. + * Alternatively, the BZHI instruction can be used to clear the bits above N. + */ +#if SZ_USE_SKYLAKE || SZ_USE_ICE +SZ_INTERNAL __mmask16 _sz_u16_mask_until(sz_size_t n) { return (__mmask16)_bzhi_u32(0xFFFFu, n); } +SZ_INTERNAL __mmask32 _sz_u32_mask_until(sz_size_t n) { return (__mmask32)_bzhi_u64(0xFFFFFFFFu, n); } +SZ_INTERNAL __mmask64 _sz_u64_mask_until(sz_size_t n) { return (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n); } +SZ_INTERNAL __mmask16 _sz_u16_clamp_mask_until(sz_size_t n) { return n < 16 ? _sz_u16_mask_until(n) : 0xFFFFu; } +SZ_INTERNAL __mmask32 _sz_u32_clamp_mask_until(sz_size_t n) { return n < 32 ? _sz_u32_mask_until(n) : 0xFFFFFFFFu; } +SZ_INTERNAL __mmask64 _sz_u64_clamp_mask_until(sz_size_t n) { + return n < 64 ? _sz_u64_mask_until(n) : 0xFFFFFFFFFFFFFFFFull; +} +#endif + /** * @brief Byte-level equality comparison between two 64-bit integers. * @return 64-bit integer, where every top bit in each byte signifies a match. From fa47debf2f70d526be4625309f52d5a8a5d5643f Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 11 Dec 2024 14:18:27 +0000 Subject: [PATCH 064/751] Fix: BMI flags for `BZHI` --- include/stringzilla/types.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index 57ff7124..a170b6b0 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -860,6 +860,9 @@ SZ_INTERNAL sz_i32_t sz_i32_max_of_two(sz_i32_t x, sz_i32_t y) { return x - ((x * Alternatively, the BZHI instruction can be used to clear the bits above N. */ #if SZ_USE_SKYLAKE || SZ_USE_ICE +#pragma GCC push_options +#pragma GCC target("bmi", "bmi2") +#pragma clang attribute push(__attribute__((target("bmi,bmi2"))), apply_to = function) SZ_INTERNAL __mmask16 _sz_u16_mask_until(sz_size_t n) { return (__mmask16)_bzhi_u32(0xFFFFu, n); } SZ_INTERNAL __mmask32 _sz_u32_mask_until(sz_size_t n) { return (__mmask32)_bzhi_u64(0xFFFFFFFFu, n); } SZ_INTERNAL __mmask64 _sz_u64_mask_until(sz_size_t n) { return (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n); } @@ -868,6 +871,8 @@ SZ_INTERNAL __mmask32 _sz_u32_clamp_mask_until(sz_size_t n) { return n < 32 ? _s SZ_INTERNAL __mmask64 _sz_u64_clamp_mask_until(sz_size_t n) { return n < 64 ? 
_sz_u64_mask_until(n) : 0xFFFFFFFFFFFFFFFFull; } +#pragma GCC pop_options +#pragma clang attribute pop #endif /** From d9557d35078e06e61e3667d2f01b367940189a96 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 11 Dec 2024 14:18:43 +0000 Subject: [PATCH 065/751] Improve: Faster `levenshtein_baseline` --- scripts/test.hpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/scripts/test.hpp b/scripts/test.hpp index 9f9abe6b..261f90a5 100644 --- a/scripts/test.hpp +++ b/scripts/test.hpp @@ -67,29 +67,29 @@ inline std::string random_string(std::size_t length, char const *alphabet, std:: * Allocates a new matrix on every call, with rows potentially scattered around memory. */ inline std::size_t levenshtein_baseline(char const *s1, std::size_t len1, char const *s2, std::size_t len2) { - std::vector> dp(len1 + 1, std::vector(len2 + 1)); + std::size_t const rows = len1 + 1; + std::size_t const cols = len2 + 1; + std::vector matrix_buffer(rows * cols); // Initialize the borders of the matrix. - for (std::size_t i = 0; i <= len1; ++i) dp[i][0] = i; - for (std::size_t j = 0; j <= len2; ++j) dp[0][j] = j; + for (std::size_t i = 0; i < rows; ++i) matrix_buffer[i * cols + 0] /* [i][0] in 2D */ = i; + for (std::size_t j = 0; j < cols; ++j) matrix_buffer[0 * cols + j] /* [0][j] in 2D */ = j; - for (std::size_t i = 1; i <= len1; ++i) { - for (std::size_t j = 1; j <= len2; ++j) { + for (std::size_t i = 1; i < rows; ++i) { + std::size_t const *last_row = &matrix_buffer[(i - 1) * cols]; + std::size_t *row = &matrix_buffer[i * cols]; + for (std::size_t j = 1; j < cols; ++j) { std::size_t cost = (s1[i - 1] == s2[j - 1]) ? 0 : 1; - // dp[i][j] is the minimum of deletion, insertion, or substitution - dp[i][j] = std::min({ - dp[i - 1][j] + 1, // Deletion - dp[i][j - 1] + 1, // Insertion - dp[i - 1][j - 1] + cost // Substitution - }); + std::size_t deletion_or_insertion = std::min(last_row[j], row[j - 1]) + 1; + row[j] = std::min(deletion_or_insertion, last_row[j - 1] + cost); } } - return dp[len1][len2]; + return matrix_buffer.back(); } /** - * @brief Produces a substitution cost matrix for the Needlemann-Wunsch alignment score, + * @brief Produces a substitution cost matrix for the Needleman-Wunsch alignment score, * that would yield the same result as the negative Levenshtein distance. 
*/ inline std::vector unary_substitution_costs() { From d20e589a56921679d0642350df8026df3240b54f Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 11 Dec 2024 14:24:55 +0000 Subject: [PATCH 066/751] Fix: Minor dispatch issues --- .vscode/settings.json | 3 ++- CMakeLists.txt | 5 ++++- include/stringzilla/hash.h | 8 ++++---- include/stringzilla/similarity.h | 10 +++++++--- scripts/bench_memory.cpp | 2 +- scripts/bench_similarity.cpp | 22 ++++++++++++++++++---- scripts/bench_token.cpp | 4 ++-- scripts/test.cpp | 13 ++++++++++--- 8 files changed, 48 insertions(+), 19 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 9d0e1b53..051fc5c8 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -268,7 +268,8 @@ "xtree": "cpp", "xutility": "cpp", "errno.h": "c", - "text_encoding": "cpp" + "text_encoding": "cpp", + "ranges": "cpp" }, "python.pythonPath": "~/miniconda3/bin/python" } \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 7914aa0e..df90ad80 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -367,11 +367,14 @@ if(${STRINGZILLA_BUILD_SHARED}) define_shared(stringzilla_shared) target_compile_definitions(stringzilla_shared PRIVATE "SZ_AVOID_LIBC=0") target_compile_definitions(stringzilla_shared PRIVATE "SZ_OVERRIDE_LIBC=1") - + target_include_directories(stringzilla_shared PUBLIC include) + + # Try compiling a version without linking the LibC define_shared(stringzilla_bare) target_compile_definitions(stringzilla_bare PRIVATE "SZ_AVOID_LIBC=1") target_compile_definitions(stringzilla_bare PRIVATE "SZ_OVERRIDE_LIBC=1") + target_include_directories(stringzilla_bare PUBLIC include) # Avoid built-ins on MSVC and other compilers, as that will cause compilation errors target_compile_options(stringzilla_bare PRIVATE diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index 0e5e883e..262cbdc9 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -736,8 +736,8 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) { } } -SZ_PUBLIC void sz_hashes_skylake(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { +SZ_PUBLIC void sz_hashes_ice(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // + sz_hash_callback_t callback, void *callback_handle) { if (length < window_length || !window_length) return; if (length < 4 * window_length) { @@ -932,8 +932,8 @@ SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // sz_hash_callback_t callback, void *callback_handle) { -#if SZ_USE_SKYLAKE - sz_hashes_skylake(text, length, window_length, window_step, callback, callback_handle); +#if SZ_USE_ICE + sz_hashes_ice(text, length, window_length, window_step, callback, callback_handle); #elif SZ_USE_HASWELL sz_hashes_haswell(text, length, window_length, window_step, callback, callback_handle); #else diff --git a/include/stringzilla/similarity.h b/include/stringzilla/similarity.h index 0b119127..5c521a40 100644 --- a/include/stringzilla/similarity.h +++ b/include/stringzilla/similarity.h @@ -639,9 +639,12 @@ SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( // * @brief Computes the edit distance between two very short byte-strings using the AVX-512VBMI extensions. 
* * Applies to string lengths up to 63, and evaluates at most (63 * 2 + 1 = 127) diagonals, or just as many loop - * cycles. Supports an early exit, if the distance is bounded. Keeps all of the data and Levenshtein matrices skew - * diagonal in just a couple of registers. Benefits from the @b `vpermb` instructions, that can rotate the bytes - * across the entire ZMM register. + * cycles. Supports an early exit, if the distance is bounded. Keeps all of the data and Levenshtein matrices skew + * diagonal in just a couple of registers. Benefits from the @b `vpermb` instructions, that can rotate the bytes + * across the entire ZMM register. + * + *? Bounds check, for inputs ranging from 33 to 64 bytes doesn't affect the performance at all. + *? It's also worth exploring `_mm512_alignr_epi8` and `_mm512_maskz_compress_epi8` for the shift. */ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_ice( // sz_cptr_t shorter, sz_size_t shorter_length, // @@ -678,6 +681,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_ice( // bound_vec.zmm = _mm512_set1_epi8(bound <= 255 ? (sz_u8_t)bound : 255); // To simplify comparisons and traversals, we want to reverse the order of bytes in the shorter string. + shorter_vec.zmm = _mm512_setzero_si512(); //? To simplify debugging. for (sz_size_t i = 0; i != shorter_length; ++i) shorter_vec.u8s[63 - i] = shorter[i]; shorter_rotated_vec.zmm = _mm512_permutexvar_epi8(rotate_right_vec.zmm, shorter_vec.zmm); diff --git a/scripts/bench_memory.cpp b/scripts/bench_memory.cpp index 93d7ab2d..7a9acf25 100644 --- a/scripts/bench_memory.cpp +++ b/scripts/bench_memory.cpp @@ -110,7 +110,7 @@ tracked_unary_functions_t fill_functions(sz_cptr_t dataset_start_ptr, sz_ptr_t o })}, {"sz_fill_serial", wrap_sz(sz_fill_serial)}, #if SZ_USE_SKYLAKE - {"sz_fill_avx512", wrap_sz(sz_fill_skylake)}, + {"sz_fill_skylake", wrap_sz(sz_fill_skylake)}, #endif #if SZ_USE_HASWELL {"sz_fill_haswell", wrap_sz(sz_fill_haswell)}, diff --git a/scripts/bench_similarity.cpp b/scripts/bench_similarity.cpp index 9aa964c3..ca901a5f 100644 --- a/scripts/bench_similarity.cpp +++ b/scripts/bench_similarity.cpp @@ -38,7 +38,7 @@ tracked_binary_functions_t distance_functions() { }); auto wrap_sz_distance = [alloc](auto function) mutable -> binary_function_t { return binary_function_t([function, alloc](std::string_view a, std::string_view b) mutable -> std::size_t { - return function(a.data(), a.length(), b.data(), b.length(), (sz_size_t)0, &alloc); + return function(a.data(), a.length(), b.data(), b.length(), SZ_SIZE_MAX, &alloc); }); }; auto wrap_sz_scoring = [alloc, costs_ptr](auto function) mutable -> binary_function_t { @@ -113,10 +113,24 @@ void bench_similarity_on_input_data(int argc, char const **argv) { std::printf("Benchmarking on real words:\n"); bench_similarity(dataset.tokens); + struct size_range_t { + std::size_t min_length; + std::size_t max_length; + }; + // Run benchmarks on tokens of different length - for (std::size_t token_length : {20}) { - std::printf("Benchmarking on real words of length %zu and longer:\n", token_length); - bench_similarity(filter_by_length(dataset.tokens, token_length, std::greater_equal {})); + for (size_range_t size : { + size_range_t {1, 16}, + size_range_t {17, 32}, + size_range_t {33, 64}, + size_range_t {65, 128}, + }) { + auto filtered_dataset = filter_by_length(dataset.tokens, size.min_length, std::greater_equal {}); + filtered_dataset = filter_by_length(filtered_dataset, size.max_length, std::greater_equal {}); + if 
(filtered_dataset.size() < 3) continue; + std::printf("Benchmarking on %zu real words of length %zu to %zu:\n", filtered_dataset.size(), size.min_length, + size.max_length); + bench_similarity(std::move(filtered_dataset)); } } diff --git a/scripts/bench_token.cpp b/scripts/bench_token.cpp index 492f93f4..eb82dfd4 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -55,8 +55,8 @@ tracked_unary_functions_t sliding_hashing_functions(std::size_t window_width, st }; std::string suffix = std::to_string(window_width) + ":step" + std::to_string(step); tracked_unary_functions_t result = { -#if SZ_USE_SKYLAKE - {"sz_hashes_skylake:" + suffix, wrap_sz(sz_hashes_skylake)}, +#if SZ_USE_ICE + {"sz_hashes_ice:" + suffix, wrap_sz(sz_hashes_ice)}, #endif #if SZ_USE_HASWELL {"sz_hashes_haswell:" + suffix, wrap_sz(sz_hashes_haswell)}, diff --git a/scripts/test.cpp b/scripts/test.cpp index 181e0648..e9bcf3c7 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -173,6 +173,10 @@ static void test_ascii_utilities() { assert(str("").is_printable()); assert(str("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+").is_printable()); assert(!str("012🔥").is_printable()); + + assert(str("").contains_only(sz::char_set("abc"))); + assert(str("abc").contains_only(sz::char_set("abc"))); + assert(!str("abcd").contains_only(sz::char_set("abc"))); } inline void expect_equality(char const *a, char const *b, std::size_t size) { @@ -1423,6 +1427,8 @@ static void test_levenshtein_distances() { char const *right; std::size_t distance; } explicit_cases[] = { + {"a", "a", 0}, + {"A", "=", 1}, {"listen", "silent", 4}, {"", "", 0}, {"", "abc", 3}, @@ -1473,7 +1479,7 @@ static void test_levenshtein_distances() { // Validate the bounded variants: if (received > 1) { assert(sz::edit_distance(l, r, received) == received); - assert(sz::edit_distance(r, l, received - 1) == SZ_SIZE_MAX); + assert(sz::edit_distance(r, l, received - 1) >= (std::max)(l.size(), r.size())); } }; @@ -1614,8 +1620,9 @@ int main(int argc, char const **argv) { // Let's greet the user nicely sz_unused(argc && argv); std::printf("Hi, dear tester! You look nice today!\n"); - std::printf("- Uses AVX2: %s \n", SZ_USE_HASWELL ? "yes" : "no"); - std::printf("- Uses AVX512: %s \n", SZ_USE_ICE ? "yes" : "no"); + std::printf("- Uses Haswell: %s \n", SZ_USE_HASWELL ? "yes" : "no"); + std::printf("- Uses Skylake: %s \n", SZ_USE_SKYLAKE ? "yes" : "no"); + std::printf("- Uses Ice Lake: %s \n", SZ_USE_ICE ? "yes" : "no"); std::printf("- Uses NEON: %s \n", SZ_USE_NEON ? "yes" : "no"); std::printf("- Uses SVE: %s \n", SZ_USE_SVE ? 
"yes" : "no"); From 821d19ed73c5a163f1fcd76a7d10de29be4b1b88 Mon Sep 17 00:00:00 2001 From: ashbob999 Date: Sun, 5 Jan 2025 20:08:25 +0000 Subject: [PATCH 067/751] Fix: stable sort bench tests failing --- scripts/bench_sort.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/bench_sort.cpp b/scripts/bench_sort.cpp index f46be4a3..1cd99e49 100644 --- a/scripts/bench_sort.cpp +++ b/scripts/bench_sort.cpp @@ -232,9 +232,9 @@ int main(int argc, char const **argv) { }); expect_sorted(strings, permute_base); - bench_permute( - "hybrid_stable_sort_cpp", strings, permute_base, - [](strings_t const &strings, permute_t &permute) { hybrid_stable_sort_cpp(strings, permute.data()); }); + bench_permute("hybrid_stable_sort_cpp", strings, permute_new, [](strings_t const &strings, permute_t &permute) { + hybrid_stable_sort_cpp(strings, permute.data()); + }); expect_sorted(strings, permute_new); expect_same(permute_base, permute_new); } From 455508f9aad42c295248fb6482711f32359d5521 Mon Sep 17 00:00:00 2001 From: ashbob999 Date: Sun, 5 Jan 2025 21:40:44 +0000 Subject: [PATCH 068/751] Fix: hybrid bench sorts loading initial stirng bytes incorrectly --- scripts/bench_sort.cpp | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/scripts/bench_sort.cpp b/scripts/bench_sort.cpp index 1cd99e49..9b484baa 100644 --- a/scripts/bench_sort.cpp +++ b/scripts/bench_sort.cpp @@ -66,14 +66,18 @@ void populate_from_file(std::string path, strings_t &strings, while (strings.size() < limit && std::getline(f, s, ' ')) strings.push_back(s); } -constexpr size_t offset_in_word = 0; +constexpr size_t offset_in_word = 4; static idx_t hybrid_sort_cpp(strings_t const &strings, sz_u64_t *order) { // What if we take up-to 4 first characters and the index - for (size_t i = 0; i != strings.size(); ++i) - std::memcpy((char *)&order[i] + offset_in_word, strings[order[i]].c_str(), - std::min(strings[order[i]].size(), 4ul)); + for (size_t i = 0; i != strings.size(); ++i) { + size_t index = order[i]; + + for (size_t j = 0; j < std::min(strings[(sz_size_t)index].size(), 4ul); ++j) { + std::memcpy((char *)&order[i] + offset_in_word + 3 - j, strings[(sz_size_t)index].c_str() + j, 1ul); + } + } std::sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { char *i_bytes = (char *)&i; @@ -91,9 +95,13 @@ static idx_t hybrid_sort_cpp(strings_t const &strings, sz_u64_t *order) { static idx_t hybrid_stable_sort_cpp(strings_t const &strings, sz_u64_t *order) { // What if we take up-to 4 first characters and the index - for (size_t i = 0; i != strings.size(); ++i) - std::memcpy((char *)&order[i] + offset_in_word, strings[order[i]].c_str(), - std::min(strings[order[i]].size(), 4ull)); + for (size_t i = 0; i != strings.size(); ++i) { + size_t index = order[i]; + + for (size_t j = 0; j < std::min(strings[(sz_size_t)index].size(), 4ul); ++j) { + std::memcpy((char *)&order[i] + offset_in_word + 3 - j, strings[(sz_size_t)index].c_str() + j, 1ul); + } + } std::stable_sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { char *i_bytes = (char *)&i; @@ -196,7 +204,7 @@ int main(int argc, char const **argv) { }); expect_sorted(strings, permute_new); -#if __linux__ && defined(_GNU_SOURCE) +#if __linux__ && defined(_GNU_SOURCE) & !defined(__BIONIC__) bench_permute("qsort_r", strings, permute_new, [](strings_t const &strings, permute_t &permute) { sz_sequence_t array; array.order = permute.data(); From 9880f266f6d9b440db494aab744aaf0f474232a3 Mon Sep 17 00:00:00 2001 From: 
ashbob999 Date: Sun, 5 Jan 2025 21:47:56 +0000 Subject: [PATCH 069/751] Improve: hybrid bench sort performance --- scripts/bench_sort.cpp | 68 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 6 deletions(-) diff --git a/scripts/bench_sort.cpp b/scripts/bench_sort.cpp index 9b484baa..91734c1b 100644 --- a/scripts/bench_sort.cpp +++ b/scripts/bench_sort.cpp @@ -85,9 +85,37 @@ static idx_t hybrid_sort_cpp(strings_t const &strings, sz_u64_t *order) { return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word); }); - for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul); + const auto extract_bytes = [](sz_u64_t v) -> uint32_t { + char *bytes = (char *)&v; + return *(uint32_t *)(bytes + offset_in_word); + }; + + if (strings.size() >= 2) { + size_t prev_index = 0; + uint64_t prev_bytes = extract_bytes(order[0]); + + for (size_t i = 1; i < strings.size(); ++i) { + uint32_t bytes = extract_bytes(order[i]); + if (bytes != prev_bytes) { + std::sort(order + prev_index, order + i, [&](sz_u64_t i, sz_u64_t j) { + // Assumes: offset_in_word==4 + sz_size_t i_index = i & 0xFFFF'FFFF; + sz_size_t j_index = j & 0xFFFF'FFFF; + return strings[i_index] < strings[j_index]; + }); + prev_index = i; + prev_bytes = bytes; + } + } - std::sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { return strings[i] < strings[j]; }); + std::sort(order + prev_index, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { + sz_size_t i_index = i & 0xFFFF'FFFF; + sz_size_t j_index = j & 0xFFFF'FFFF; + return strings[i_index] < strings[j_index]; + }); + } + + for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul); return strings.size(); } @@ -109,9 +137,37 @@ static idx_t hybrid_stable_sort_cpp(strings_t const &strings, sz_u64_t *order) { return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word); }); - for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul); + const auto extract_bytes = [](sz_u64_t v) -> uint32_t { + char *bytes = (char *)&v; + return *(uint32_t *)(bytes + offset_in_word); + }; + + if (strings.size() >= 2) { + size_t prev_index = 0; + uint64_t prev_bytes = extract_bytes(order[0]); + + for (size_t i = 1; i < strings.size(); ++i) { + uint32_t bytes = extract_bytes(order[i]); + if (bytes != prev_bytes) { + std::stable_sort(order + prev_index, order + i, [&](sz_u64_t i, sz_u64_t j) { + // Assumes: offset_in_word==4 + sz_size_t i_index = i & 0xFFFF'FFFF; + sz_size_t j_index = j & 0xFFFF'FFFF; + return strings[i_index] < strings[j_index]; + }); + prev_index = i; + prev_bytes = bytes; + } + } + + std::stable_sort(order + prev_index, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { + sz_size_t i_index = i & 0xFFFF'FFFF; + sz_size_t j_index = j & 0xFFFF'FFFF; + return strings[i_index] < strings[j_index]; + }); + } - std::stable_sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { return strings[i] < strings[j]; }); + for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul); return strings.size(); } @@ -204,7 +260,7 @@ int main(int argc, char const **argv) { }); expect_sorted(strings, permute_new); -#if __linux__ && defined(_GNU_SOURCE) & !defined(__BIONIC__) +#if __linux__ && defined(_GNU_SOURCE) && !defined(__BIONIC__) bench_permute("qsort_r", strings, permute_new, [](strings_t const &strings, permute_t &permute) { 
sz_sequence_t array; array.order = permute.data(); @@ -248,4 +304,4 @@ int main(int argc, char const **argv) { } return 0; -} \ No newline at end of file +} From 2c49eaed742ec055a51b9ba398ed54395ee73707 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 12 Feb 2025 21:51:08 +0000 Subject: [PATCH 070/751] Break: Replace `char_set` constructor with literals --- README.md | 6 +- include/stringzilla/stringzilla.hpp | 73 +++++---- scripts/test.cpp | 227 +++++++++++++++++----------- 3 files changed, 185 insertions(+), 121 deletions(-) diff --git a/README.md b/README.md index bf3872a7..453b3bf6 100644 --- a/README.md +++ b/README.md @@ -724,12 +724,12 @@ haystack.compare(needle) == 1; // Or `haystack <=> needle` in C++ 20 and beyond StringZilla also provides string literals for automatic type resolution, [similar to STL][stl-literal]: ```cpp -using sz::literals::operator""_sz; +using sz::literals::operator""_sv; using std::literals::operator""sv; auto a = "some string"; // char const * auto b = "some string"sv; // std::string_view -auto b = "some string"_sz; // sz::string_view +auto b = "some string"_sv; // sz::string_view ``` [stl-literal]: https://en.cppreference.com/w/cpp/string/basic_string_view/operator%22%22sv @@ -887,7 +887,7 @@ str("a:b").back(-2) == ":b"; // similar to Python's `"a:b"[-2:]` str("a:b").sub(1, -1) == ":"; // similar to Python's `"a:b"[1:-1]` str("a:b").sub(-2, -1) == ":"; // similar to Python's `"a:b"[-2:-1]` str("a:b").sub(-2, 1) == ""; // similar to Python's `"a:b"[-2:1]` -"a:b"_sz[{-2, -1}] == ":"; // works on views and overloads `operator[]` +"a:b"_sv[{-2, -1}] == ":"; // works on views and overloads `operator[]` ``` Assuming StringZilla is a header-only library you can use the full API in some translation units and gradually transition to safer restricted API in others. diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index a80da804..94c75cba 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -67,7 +67,7 @@ namespace ashvardanian { namespace stringzilla { template -class basic_charset; +class basic_char_set; template class basic_string_slice; template @@ -266,79 +266,85 @@ inline carray<64> const &base64() noexcept { * @brief A set of characters represented as a bitset with 256 slots. */ template -class basic_charset { +class basic_char_set { sz_charset_t bitset_; public: using char_type = char_type_; - basic_charset() noexcept { + constexpr basic_char_set() noexcept { // ! Instead of relying on the `sz_charset_init`, we have to reimplement it to support `constexpr`. bitset_._u64s[0] = 0, bitset_._u64s[1] = 0, bitset_._u64s[2] = 0, bitset_._u64s[3] = 0; } - explicit basic_charset(std::initializer_list chars) noexcept : basic_charset() { + explicit constexpr basic_char_set(std::initializer_list chars) noexcept : basic_char_set() { // ! Instead of relying on the `sz_charset_add(&bitset_, c)`, we have to reimplement it to support `constexpr`. 
for (auto c : chars) bitset_._u64s[sz_bitcast(sz_u8_t, c) >> 6] |= (1ull << (sz_bitcast(sz_u8_t, c) & 63u)); } - template - explicit basic_charset(char_type const (&chars)[count_characters]) noexcept : basic_charset() { - static_assert(count_characters > 0, "Character array cannot be empty"); - for (std::size_t i = 0; i < count_characters - 1; ++i) { // count_characters - 1 to exclude the null terminator + + explicit constexpr basic_char_set(char_type const *chars, std::size_t count_characters) noexcept + : basic_char_set() { + for (std::size_t i = 0; i < count_characters; ++i) { char_type c = chars[i]; bitset_._u64s[sz_bitcast(sz_u8_t, c) >> 6] |= (1ull << (sz_bitcast(sz_u8_t, c) & 63u)); } } template - explicit basic_charset(std::array const &chars) noexcept : basic_charset() { + explicit constexpr basic_char_set(std::array const &chars) noexcept + : basic_char_set() { static_assert(count_characters > 0, "Character array cannot be empty"); - for (std::size_t i = 0; i < count_characters - 1; ++i) { // count_characters - 1 to exclude the null terminator + for (std::size_t i = 0; i < count_characters; ++i) { char_type c = chars[i]; bitset_._u64s[sz_bitcast(sz_u8_t, c) >> 6] |= (1ull << (sz_bitcast(sz_u8_t, c) & 63u)); } } - basic_charset(basic_charset const &other) noexcept : bitset_(other.bitset_) {} - basic_charset &operator=(basic_charset const &other) noexcept { + constexpr basic_char_set(basic_char_set const &other) noexcept : bitset_(other.bitset_) {} + constexpr basic_char_set &operator=(basic_char_set const &other) noexcept { bitset_ = other.bitset_; return *this; } - basic_charset operator|(basic_charset other) const noexcept { - basic_charset result = *this; + constexpr basic_char_set operator|(basic_char_set other) const noexcept { + basic_char_set result = *this; result.bitset_._u64s[0] |= other.bitset_._u64s[0], result.bitset_._u64s[1] |= other.bitset_._u64s[1], result.bitset_._u64s[2] |= other.bitset_._u64s[2], result.bitset_._u64s[3] |= other.bitset_._u64s[3]; - return *this; + return result; } - inline basic_charset &add(char_type c) noexcept { + inline basic_char_set &add(char_type c) noexcept { sz_charset_add(&bitset_, sz_bitcast(sz_u8_t, c)); return *this; } + inline std::size_t size() const noexcept { + return // + sz_u64_popcount(bitset_._u64s[0]) + sz_u64_popcount(bitset_._u64s[1]) + // + sz_u64_popcount(bitset_._u64s[2]) + sz_u64_popcount(bitset_._u64s[3]); + } inline sz_charset_t &raw() noexcept { return bitset_; } inline sz_charset_t const &raw() const noexcept { return bitset_; } inline bool contains(char_type c) const noexcept { return sz_charset_contains(&bitset_, sz_bitcast(sz_u8_t, c)); } - inline basic_charset inverted() const noexcept { - basic_charset result = *this; + inline basic_char_set inverted() const noexcept { + basic_char_set result = *this; sz_charset_invert(&result.bitset_); return result; } }; -using char_set = basic_charset; - -inline char_set ascii_letters_set() { return char_set {ascii_letters()}; } -inline char_set ascii_lowercase_set() { return char_set {ascii_lowercase()}; } -inline char_set ascii_uppercase_set() { return char_set {ascii_uppercase()}; } -inline char_set ascii_printables_set() { return char_set {ascii_printables()}; } -inline char_set ascii_controls_set() { return char_set {ascii_controls()}; } -inline char_set digits_set() { return char_set {digits()}; } -inline char_set hexdigits_set() { return char_set {hexdigits()}; } -inline char_set octdigits_set() { return char_set {octdigits()}; } -inline char_set punctuation_set() { 
return char_set {punctuation()}; } -inline char_set whitespaces_set() { return char_set {whitespaces()}; } -inline char_set newlines_set() { return char_set {newlines()}; } -inline char_set base64_set() { return char_set {base64()}; } +using char_set = basic_char_set; + +inline char_set ascii_letters_set() { return char_set {ascii_letters(), sizeof(ascii_letters())}; } +inline char_set ascii_lowercase_set() { return char_set {ascii_lowercase(), sizeof(ascii_lowercase())}; } +inline char_set ascii_uppercase_set() { return char_set {ascii_uppercase(), sizeof(ascii_uppercase())}; } +inline char_set ascii_printables_set() { return char_set {ascii_printables(), sizeof(ascii_printables())}; } +inline char_set ascii_controls_set() { return char_set {ascii_controls(), sizeof(ascii_controls())}; } +inline char_set digits_set() { return char_set {digits(), sizeof(digits())}; } +inline char_set hexdigits_set() { return char_set {hexdigits(), sizeof(hexdigits())}; } +inline char_set octdigits_set() { return char_set {octdigits(), sizeof(octdigits())}; } +inline char_set punctuation_set() { return char_set {punctuation(), sizeof(punctuation())}; } +inline char_set whitespaces_set() { return char_set {whitespaces(), sizeof(whitespaces())}; } +inline char_set newlines_set() { return char_set {newlines(), sizeof(newlines())}; } +inline char_set base64_set() { return char_set {base64(), sizeof(base64())}; } /** * @brief A look-up table for character replacement operations. @@ -3446,7 +3452,8 @@ using string = basic_string>; static_assert(sizeof(string) == 4 * sizeof(void *), "String size must be 4 pointers."); namespace literals { -constexpr string_view operator""_sz(char const *str, std::size_t length) noexcept { return {str, length}; } +constexpr string_view operator""_sv(char const *str, std::size_t length) noexcept { return {str, length}; } +constexpr char_set operator""_cs(char const *str, std::size_t length) noexcept { return char_set {str, length}; } } // namespace literals template diff --git a/scripts/test.cpp b/scripts/test.cpp index ead0c88d..87db34c8 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -46,7 +46,8 @@ namespace sz = ashvardanian::stringzilla; using namespace sz::scripts; -using sz::literals::operator""_sz; +using sz::literals::operator""_sv; // for `sz::string_view` +using sz::literals::operator""_cs; // for `sz::char_set` /* * Instantiate all the templates to make the symbols visible and also check @@ -58,7 +59,7 @@ template class std::basic_string_view; template class sz::basic_string_slice; template class std::basic_string; template class sz::basic_string; -template class sz::basic_charset; +template class sz::basic_char_set; template class std::vector; template class std::map; @@ -137,6 +138,61 @@ static void test_arithmetical_utilities() { (static_cast(number) / static_cast(divisor))); } +/** + * @brief Tests various ASCII-based methods (e.g., `is_alpha`, `is_digit`) + * provided by `sz::string` and `sz::string_view`. 
+ */ +template +static void test_ascii_utilities() { + + using str = string_type; + + assert("aaa"_cs.size() == 1ull); + assert("\0\0"_cs.size() == 1ull); + assert("abc"_cs.size() == 3ull); + assert("a\0bc"_cs.size() == 4ull); + + assert(!"abc"_cs.contains('\0')); + assert(str("bca").contains_only("abc"_cs)); + + assert(!str("").is_alpha()); + assert(str("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ").is_alpha()); + assert(!str("abc9").is_alpha()); + + assert(!str("").is_alnum()); + assert(str("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789").is_alnum()); + assert(!str("abc!").is_alnum()); + + assert(str("").is_ascii()); + assert(str("\x00x7F").is_ascii()); + assert(!str("abc123🔥").is_ascii()); + + assert(!str("").is_digit()); + assert(str("0123456789").is_digit()); + assert(!str("012a").is_digit()); + + assert(!str("").is_lower()); + assert(str("abcdefghijklmnopqrstuvwxyz").is_lower()); + assert(!str("abcA").is_lower()); + assert(!str("abc\n").is_lower()); + + assert(!str("").is_space()); + assert(str(" \t\n\r\f\v").is_space()); + assert(!str(" \t\r\na").is_space()); + + assert(!str("").is_upper()); + assert(str("ABCDEFGHIJKLMNOPQRSTUVWXYZ").is_upper()); + assert(!str("ABCa").is_upper()); + + assert(str("").is_printable()); + assert(str("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+").is_printable()); + assert(!str("012🔥").is_printable()); + + assert(str("").contains_only("abc"_cs)); + assert(str("abc").contains_only("abc"_cs)); + assert(!str("abcd").contains_only("abc"_cs)); +} + inline void expect_equality(char const *a, char const *b, std::size_t size) { if (std::memcmp(a, b, size) == 0) return; std::size_t mismatch_position = 0; @@ -838,9 +894,9 @@ void test_non_stl_extensions_for_updates() { assert_scoped(str s = "hello", s.replace_all("xx", "xx"), s == "hello"); assert_scoped(str s = "hello", s.replace_all("l", "1"), s == "he11o"); assert_scoped(str s = "hello", s.replace_all("he", "al"), s == "alllo"); - assert_scoped(str s = "hello", s.replace_all(sz::char_set("x"), "!"), s == "hello"); - assert_scoped(str s = "hello", s.replace_all(sz::char_set("o"), "!"), s == "hell!"); - assert_scoped(str s = "hello", s.replace_all(sz::char_set("ho"), "!"), s == "!ell!"); + assert_scoped(str s = "hello", s.replace_all("x"_cs, "!"), s == "hello"); + assert_scoped(str s = "hello", s.replace_all("o"_cs, "!"), s == "hell!"); + assert_scoped(str s = "hello", s.replace_all("ho"_cs, "!"), s == "!ell!"); // Shorter replacements. assert_scoped(str s = "hello", s.replace_all("xx", "x"), s == "hello"); @@ -848,8 +904,8 @@ void test_non_stl_extensions_for_updates() { assert_scoped(str s = "hello", s.replace_all("h", ""), s == "ello"); assert_scoped(str s = "hello", s.replace_all("o", ""), s == "hell"); assert_scoped(str s = "hello", s.replace_all("llo", "!"), s == "he!"); - assert_scoped(str s = "hello", s.replace_all(sz::char_set("x"), ""), s == "hello"); - assert_scoped(str s = "hello", s.replace_all(sz::char_set("lo"), ""), s == "he"); + assert_scoped(str s = "hello", s.replace_all("x"_cs, ""), s == "hello"); + assert_scoped(str s = "hello", s.replace_all("lo"_cs, ""), s == "he"); // Longer replacements. 
assert_scoped(str s = "hello", s.replace_all("xx", "xxx"), s == "hello"); @@ -857,8 +913,8 @@ void test_non_stl_extensions_for_updates() { assert_scoped(str s = "hello", s.replace_all("h", "hh"), s == "hhello"); assert_scoped(str s = "hello", s.replace_all("o", "oo"), s == "helloo"); assert_scoped(str s = "hello", s.replace_all("llo", "llo!"), s == "hello!"); - assert_scoped(str s = "hello", s.replace_all(sz::char_set("x"), "xx"), s == "hello"); - assert_scoped(str s = "hello", s.replace_all(sz::char_set("lo"), "lo"), s == "helololo"); + assert_scoped(str s = "hello", s.replace_all("x"_cs, "xx"), s == "hello"); + assert_scoped(str s = "hello", s.replace_all("lo"_cs, "lo"), s == "helololo"); // Directly mapping bytes using a Look-Up Table. sz::look_up_table invert_case = sz::look_up_table::identity(); @@ -872,8 +928,8 @@ void test_non_stl_extensions_for_updates() { assert(str(str("a") | str("b")) == "ab"); assert(str(str("a") | str("b") | str("ab")) == "abab"); - assert(str(sz::concatenate("a"_sz, "b"_sz)) == "ab"); - assert(str(sz::concatenate("a"_sz, "b"_sz, "c"_sz)) == "abc"); + assert(str(sz::concatenate("a"_sv, "b"_sv)) == "ab"); + assert(str(sz::concatenate("a"_sv, "b"_sv, "c"_sv)) == "abc"); // Randomization. assert(str::random(0).empty()); @@ -1062,15 +1118,15 @@ static void test_updates(std::size_t repetitions = 1024) { */ static void test_comparisons() { // Comparing relative order of the strings - assert("a"_sz.compare("a") == 0); - assert("a"_sz.compare("ab") == -1); - assert("ab"_sz.compare("a") == 1); - assert("a"_sz.compare("a\0"_sz) == -1); - assert("a\0"_sz.compare("a") == 1); - assert("a\0"_sz.compare("a\0"_sz) == 0); - assert("a"_sz == "a"_sz); - assert("a"_sz != "a\0"_sz); - assert("a\0"_sz == "a\0"_sz); + assert("a"_sv.compare("a") == 0); + assert("a"_sv.compare("ab") == -1); + assert("ab"_sv.compare("a") == 1); + assert("a"_sv.compare("a\0"_sv) == -1); + assert("a\0"_sv.compare("a") == 1); + assert("a\0"_sv.compare("a\0"_sv) == 0); + assert("a"_sv == "a"_sv); + assert("a"_sv != "a\0"_sv); + assert("a\0"_sv == "a\0"_sv); } /** @@ -1099,57 +1155,57 @@ static void test_search() { assert(sz::string_view(sz::ascii_printables(), sizeof(sz::ascii_printables())).find_first_of("~") != sz::string_view::npos); - assert("aabaa"_sz.remove_prefix("a") == "abaa"); - assert("aabaa"_sz.remove_suffix("a") == "aaba"); - assert("aabaa"_sz.lstrip(sz::char_set {"a"}) == "baa"); - assert("aabaa"_sz.rstrip(sz::char_set {"a"}) == "aab"); - assert("aabaa"_sz.strip(sz::char_set {"a"}) == "b"); + assert("aabaa"_sv.remove_prefix("a") == "abaa"); + assert("aabaa"_sv.remove_suffix("a") == "aaba"); + assert("aabaa"_sv.lstrip("a"_cs) == "baa"); + assert("aabaa"_sv.rstrip("a"_cs) == "aab"); + assert("aabaa"_sv.strip("a"_cs) == "b"); // Check more advanced composite operations - assert("abbccc"_sz.partition('b').before.size() == 1); - assert("abbccc"_sz.partition("bb").before.size() == 1); - assert("abbccc"_sz.partition("bb").match.size() == 2); - assert("abbccc"_sz.partition("bb").after.size() == 3); - assert("abbccc"_sz.partition("bb").before == "a"); - assert("abbccc"_sz.partition("bb").match == "bb"); - assert("abbccc"_sz.partition("bb").after == "ccc"); - assert("abb ccc"_sz.partition(sz::whitespaces_set()).after == "ccc"); + assert("abbccc"_sv.partition('b').before.size() == 1); + assert("abbccc"_sv.partition("bb").before.size() == 1); + assert("abbccc"_sv.partition("bb").match.size() == 2); + assert("abbccc"_sv.partition("bb").after.size() == 3); + assert("abbccc"_sv.partition("bb").before == 
"a"); + assert("abbccc"_sv.partition("bb").match == "bb"); + assert("abbccc"_sv.partition("bb").after == "ccc"); + assert("abb ccc"_sv.partition(sz::whitespaces_set()).after == "ccc"); // Check ranges of search matches - assert("hello"_sz.find_all("l").size() == 2); - assert("hello"_sz.rfind_all("l").size() == 2); - - assert(""_sz.find_all(".", sz::include_overlaps_type {}).size() == 0); - assert(""_sz.find_all(".", sz::exclude_overlaps_type {}).size() == 0); - assert("."_sz.find_all(".", sz::include_overlaps_type {}).size() == 1); - assert("."_sz.find_all(".", sz::exclude_overlaps_type {}).size() == 1); - assert(".."_sz.find_all(".", sz::include_overlaps_type {}).size() == 2); - assert(".."_sz.find_all(".", sz::exclude_overlaps_type {}).size() == 2); - assert(""_sz.rfind_all(".", sz::include_overlaps_type {}).size() == 0); - assert(""_sz.rfind_all(".", sz::exclude_overlaps_type {}).size() == 0); - assert("."_sz.rfind_all(".", sz::include_overlaps_type {}).size() == 1); - assert("."_sz.rfind_all(".", sz::exclude_overlaps_type {}).size() == 1); - assert(".."_sz.rfind_all(".", sz::include_overlaps_type {}).size() == 2); - assert(".."_sz.rfind_all(".", sz::exclude_overlaps_type {}).size() == 2); - - assert("a.b.c.d"_sz.find_all(".").size() == 3); - assert("a.,b.,c.,d"_sz.find_all(".,").size() == 3); - assert("a.,b.,c.,d"_sz.rfind_all(".,").size() == 3); - assert("a.b,c.d"_sz.find_all(sz::char_set(".,")).size() == 3); - assert("a...b...c"_sz.rfind_all("..").size() == 4); - assert("a...b...c"_sz.rfind_all("..", sz::include_overlaps_type {}).size() == 4); - assert("a...b...c"_sz.rfind_all("..", sz::exclude_overlaps_type {}).size() == 2); - - auto finds = "a.b.c"_sz.find_all(sz::char_set("abcd")).template to>(); + assert("hello"_sv.find_all("l").size() == 2); + assert("hello"_sv.rfind_all("l").size() == 2); + + assert(""_sv.find_all(".", sz::include_overlaps_type {}).size() == 0); + assert(""_sv.find_all(".", sz::exclude_overlaps_type {}).size() == 0); + assert("."_sv.find_all(".", sz::include_overlaps_type {}).size() == 1); + assert("."_sv.find_all(".", sz::exclude_overlaps_type {}).size() == 1); + assert(".."_sv.find_all(".", sz::include_overlaps_type {}).size() == 2); + assert(".."_sv.find_all(".", sz::exclude_overlaps_type {}).size() == 2); + assert(""_sv.rfind_all(".", sz::include_overlaps_type {}).size() == 0); + assert(""_sv.rfind_all(".", sz::exclude_overlaps_type {}).size() == 0); + assert("."_sv.rfind_all(".", sz::include_overlaps_type {}).size() == 1); + assert("."_sv.rfind_all(".", sz::exclude_overlaps_type {}).size() == 1); + assert(".."_sv.rfind_all(".", sz::include_overlaps_type {}).size() == 2); + assert(".."_sv.rfind_all(".", sz::exclude_overlaps_type {}).size() == 2); + + assert("a.b.c.d"_sv.find_all(".").size() == 3); + assert("a.,b.,c.,d"_sv.find_all(".,").size() == 3); + assert("a.,b.,c.,d"_sv.rfind_all(".,").size() == 3); + assert("a.b,c.d"_sv.find_all(".,"_cs).size() == 3); + assert("a...b...c"_sv.rfind_all("..").size() == 4); + assert("a...b...c"_sv.rfind_all("..", sz::include_overlaps_type {}).size() == 4); + assert("a...b...c"_sv.rfind_all("..", sz::exclude_overlaps_type {}).size() == 2); + + auto finds = "a.b.c"_sv.find_all("abcd"_cs).template to>(); assert(finds.size() == 3); assert(finds[0] == "a"); - auto rfinds = "a.b.c"_sz.rfind_all(sz::char_set("abcd")).template to>(); + auto rfinds = "a.b.c"_sv.rfind_all("abcd"_cs).template to>(); assert(rfinds.size() == 3); assert(rfinds[0] == "c"); { - auto splits = ".a..c."_sz.split(sz::char_set(".")).template to>(); + auto 
splits = ".a..c."_sv.split("."_cs).template to>(); assert(splits.size() == 5); assert(splits[0] == ""); assert(splits[1] == "a"); @@ -1157,36 +1213,36 @@ static void test_search() { } { - auto splits = "line1\nline2\nline3"_sz.split("line3").template to>(); + auto splits = "line1\nline2\nline3"_sv.split("line3").template to>(); assert(splits.size() == 2); assert(splits[0] == "line1\nline2\n"); assert(splits[1] == ""); } - assert(""_sz.split(".").size() == 1); - assert(""_sz.rsplit(".").size() == 1); - - assert("hello"_sz.split("l").size() == 3); - assert("hello"_sz.rsplit("l").size() == 3); - assert(*advanced("hello"_sz.split("l").begin(), 0) == "he"); - assert(*advanced("hello"_sz.rsplit("l").begin(), 0) == "o"); - assert(*advanced("hello"_sz.split("l").begin(), 1) == ""); - assert(*advanced("hello"_sz.rsplit("l").begin(), 1) == ""); - assert(*advanced("hello"_sz.split("l").begin(), 2) == "o"); - assert(*advanced("hello"_sz.rsplit("l").begin(), 2) == "he"); - - assert("a.b.c.d"_sz.split(".").size() == 4); - assert("a.b.c.d"_sz.rsplit(".").size() == 4); - assert(*("a.b.c.d"_sz.split(".").begin()) == "a"); - assert(*("a.b.c.d"_sz.rsplit(".").begin()) == "d"); - assert(*advanced("a.b.c.d"_sz.split(".").begin(), 1) == "b"); - assert(*advanced("a.b.c.d"_sz.rsplit(".").begin(), 1) == "c"); - assert(*advanced("a.b.c.d"_sz.split(".").begin(), 3) == "d"); - assert(*advanced("a.b.c.d"_sz.rsplit(".").begin(), 3) == "a"); - assert("a.b.,c,d"_sz.split(".,").size() == 2); - assert("a.b,c.d"_sz.split(sz::char_set(".,")).size() == 4); - - auto rsplits = ".a..c."_sz.rsplit(sz::char_set(".")).template to>(); + assert(""_sv.split(".").size() == 1); + assert(""_sv.rsplit(".").size() == 1); + + assert("hello"_sv.split("l").size() == 3); + assert("hello"_sv.rsplit("l").size() == 3); + assert(*advanced("hello"_sv.split("l").begin(), 0) == "he"); + assert(*advanced("hello"_sv.rsplit("l").begin(), 0) == "o"); + assert(*advanced("hello"_sv.split("l").begin(), 1) == ""); + assert(*advanced("hello"_sv.rsplit("l").begin(), 1) == ""); + assert(*advanced("hello"_sv.split("l").begin(), 2) == "o"); + assert(*advanced("hello"_sv.rsplit("l").begin(), 2) == "he"); + + assert("a.b.c.d"_sv.split(".").size() == 4); + assert("a.b.c.d"_sv.rsplit(".").size() == 4); + assert(*("a.b.c.d"_sv.split(".").begin()) == "a"); + assert(*("a.b.c.d"_sv.rsplit(".").begin()) == "d"); + assert(*advanced("a.b.c.d"_sv.split(".").begin(), 1) == "b"); + assert(*advanced("a.b.c.d"_sv.rsplit(".").begin(), 1) == "c"); + assert(*advanced("a.b.c.d"_sv.split(".").begin(), 3) == "d"); + assert(*advanced("a.b.c.d"_sv.rsplit(".").begin(), 3) == "a"); + assert("a.b.,c,d"_sv.split(".,").size() == 2); + assert("a.b,c.d"_sv.split(".,"_cs).size() == 4); + + auto rsplits = ".a..c."_sv.rsplit("."_cs).template to>(); assert(rsplits.size() == 5); assert(rsplits[0] == ""); assert(rsplits[1] == "c"); @@ -1557,6 +1613,7 @@ int main(int argc, char const **argv) { // Basic utilities test_arithmetical_utilities(); + test_ascii_utilities(); test_memory_utilities(); test_replacements(); From d18a1591c226692a53adabf53418a53cca85bb95 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 12 Feb 2025 21:51:56 +0000 Subject: [PATCH 071/751] Docs: Spelling `usnigned` --- .vscode/settings.json | 18 +++++++++--------- include/stringzilla/stringzilla.hpp | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 980956d1..87e4d065 100644 --- a/.vscode/settings.json 
+++ b/.vscode/settings.json @@ -188,6 +188,7 @@ "cwchar": "cpp", "cwctype": "cpp", "deque": "cpp", + "errno.h": "c", "exception": "cpp", "filesystem": "cpp", "format": "cpp", @@ -195,6 +196,7 @@ "fstream": "cpp", "functional": "cpp", "future": "cpp", + "immintrin.h": "c", "initializer_list": "cpp", "iomanip": "cpp", "ios": "cpp", @@ -216,6 +218,7 @@ "ostream": "cpp", "queue": "cpp", "random": "cpp", + "ranges": "cpp", "ratio": "cpp", "semaphore": "cpp", "set": "cpp", @@ -224,6 +227,7 @@ "span": "cpp", "sstream": "cpp", "stack": "cpp", + "stddef.h": "c", "stdexcept": "cpp", "stop_token": "cpp", "streambuf": "cpp", @@ -232,6 +236,7 @@ "stringzilla.h": "c", "strstream": "cpp", "system_error": "cpp", + "text_encoding": "cpp", "thread": "cpp", "tuple": "cpp", "type_traits": "cpp", @@ -242,12 +247,9 @@ "utility": "cpp", "variant": "cpp", "vector": "cpp", - "stddef.h": "c", - "immintrin.h": "c", - "xiosbase": "cpp", - "xstring": "cpp", "xfacet": "cpp", "xhash": "cpp", + "xiosbase": "cpp", "xlocale": "cpp", "xlocbuf": "cpp", "xlocinfo": "cpp", @@ -256,11 +258,9 @@ "xlocnum": "cpp", "xloctime": "cpp", "xmemory": "cpp", + "xstring": "cpp", "xtr1common": "cpp", "xtree": "cpp", - "xutility": "cpp", - "errno.h": "c", - "text_encoding": "cpp" - }, - "python.pythonPath": "~/miniconda3/bin/python" + "xutility": "cpp" + } } \ No newline at end of file diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index 94c75cba..c5918005 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -359,7 +359,7 @@ class basic_look_up_table { : sizeof(char_type_) == 2 ? 65536ul : 4294967296ul; static constexpr std::size_t bytes_k = size_k * sizeof(char_type_); - using usnigned_type_ = typename std::make_unsigned::type; + using unsigned_type_ = typename std::make_unsigned::type; char_type_ lut_[size_k]; @@ -384,13 +384,13 @@ class basic_look_up_table { */ static basic_look_up_table identity() noexcept { basic_look_up_table result; - for (std::size_t i = 0; i < size_k; ++i) { result.lut_[i] = static_cast(i); } + for (std::size_t i = 0; i < size_k; ++i) { result.lut_[i] = static_cast(i); } return result; } inline sz_cptr_t raw() const noexcept { return reinterpret_cast(&lut_[0]); } - inline char_type &operator[](char_type c) noexcept { return lut_[sz_bitcast(usnigned_type_, c)]; } - inline char_type const &operator[](char_type c) const noexcept { return lut_[sz_bitcast(usnigned_type_, c)]; } + inline char_type &operator[](char_type c) noexcept { return lut_[sz_bitcast(unsigned_type_, c)]; } + inline char_type const &operator[](char_type c) const noexcept { return lut_[sz_bitcast(unsigned_type_, c)]; } }; using look_up_table = basic_look_up_table; From 0ef7cf1ea268c3a85b2d6bb023769d64b7217713 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 12 Feb 2025 23:38:19 +0000 Subject: [PATCH 072/751] Make: Renamed include/stringzilla/hash.h -> include/stringzilla/fingerprint.h --- include/stringzilla/{hash.h => fingerprint.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/{hash.h => fingerprint.h} (100%) diff --git a/include/stringzilla/hash.h b/include/stringzilla/fingerprint.h similarity index 100% rename from include/stringzilla/hash.h rename to include/stringzilla/fingerprint.h From 70522662130c34dc631e927d882489fe68c00592 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 12 Feb 2025 23:38:19 +0000 Subject: [PATCH 
073/751] Make: Renamed include/stringzilla/hash.h -> temp-git-split-file --- include/stringzilla/hash.h => temp-git-split-file | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename include/stringzilla/hash.h => temp-git-split-file (100%) diff --git a/include/stringzilla/hash.h b/temp-git-split-file similarity index 100% rename from include/stringzilla/hash.h rename to temp-git-split-file From 5a36cb7dfc7b6f7e6cfe3c6c59b63a3b4cee98ec Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 12 Feb 2025 23:38:19 +0000 Subject: [PATCH 074/751] Make: Renamed temp-git-split-file -> include/stringzilla/hash.h --- temp-git-split-file => include/stringzilla/hash.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename temp-git-split-file => include/stringzilla/hash.h (100%) diff --git a/temp-git-split-file b/include/stringzilla/hash.h similarity index 100% rename from temp-git-split-file rename to include/stringzilla/hash.h From 38014ee288f64a715362fcfea7a40bad05af23e7 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 12 Feb 2025 23:46:05 +0000 Subject: [PATCH 075/751] Break: Deprecate old fingerprinting --- include/stringzilla/fingerprint.h | 386 ------------------------- include/stringzilla/hash.h | 421 ---------------------------- include/stringzilla/stringzilla.hpp | 3 +- scripts/test.cpp | 51 +--- 4 files changed, 5 insertions(+), 856 deletions(-) diff --git a/include/stringzilla/fingerprint.h b/include/stringzilla/fingerprint.h index 262cbdc9..9cdfcc5e 100644 --- a/include/stringzilla/fingerprint.h +++ b/include/stringzilla/fingerprint.h @@ -26,32 +26,6 @@ extern "C" { #pragma region Core API -/** - * @brief Computes the 64-bit check-sum of bytes in a string. - * Similar to `std::ranges::accumulate`. - * - * @param text String to aggregate. - * @param length Number of bytes in the text. - * @return 64-bit unsigned value. - */ -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length); - -/** - * @brief Computes the 64-bit unsigned hash of a string. Fairly fast for short strings, - * simple implementation, and supports rolling computation, reused in other APIs. - * Similar to `std::hash` in C++. - * - * @param text String to hash. - * @param length Number of bytes in the text. - * @return 64-bit hash value. - * - * @see sz_hashes, sz_hashes_fingerprint, sz_hashes_intersection - */ -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length) { - sz_unused(text && length); - return 0; -} - /** * @brief Computes the Karp-Rabin rolling hashes of a string supplying them to the provided `callback`. * Can be used for similarity scores, search, ranking, etc. @@ -124,56 +98,16 @@ SZ_PUBLIC sz_size_t sz_hashes_intersection( // sz_cptr_t text, sz_size_t length, sz_size_t window_length, // sz_cptr_t fingerprint, sz_size_t fingerprint_bytes); -/** - * @brief Generates a random string for a given alphabet, avoiding integer division and modulo operations. - * Similar to `text[i] = alphabet[rand() % cardinality]`. - * - * The modulo operation is expensive, and should be avoided in performance-critical code. - * We avoid it using small lookup tables and replacing it with a multiplication and shifts, similar to `libdivide`. 
- * Alternative algorithms would include: - * - Montgomery form: https://en.algorithmica.org/hpc/number-theory/montgomery/ - * - Barret reduction: https://www.nayuki.io/page/barrett-reduction-algorithm - * - Lemire's trick: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ - * - * @param alphabet Set of characters to sample from. - * @param cardinality Number of characters to sample from. - * @param text Output string, can point to the same address as ::text. - * @param generate Callback producing random numbers given the generator state. - * @param generator Generator state, can be a pointer to a seed, or a pointer to a random number generator. - */ -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); - -/** @copydoc sz_checksum */ -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length); - -/** @copydoc sz_hash */ -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length); - /** @copydoc sz_hashes */ SZ_PUBLIC void sz_hashes_serial( // sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // sz_hash_callback_t callback, void *callback_handle); - -/** @copydoc sz_generate */ -SZ_PUBLIC void sz_generate_serial( // - sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, sz_random_generator_t generate, - void *generator) { - sz_unused(alphabet && cardinality && text && length && generate && generator); } #pragma endregion // Core API #pragma region Serial Implementation -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length) { - sz_u64_t checksum = 0; - sz_u8_t const *text_u8 = (sz_u8_t const *)text; - sz_u8_t const *text_end = text_u8 + length; - for (; text_u8 != text_end; ++text_u8) checksum += *text_u8; - return checksum; -} - /* * One hardware-accelerated way of mixing hashes can be CRC, but it's only implemented for 32-bit values. * Using a Boost-like mixer works very poorly in such case: @@ -188,117 +122,6 @@ SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length) { #define _sz_shift_high(x) ((x + 77ull) & 0xFFull) #define _sz_prime_mod(x) (x % SZ_U64_MAX_PRIME) -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length) { - - sz_u64_t hash_low = 0; - sz_u64_t hash_high = 0; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - switch (length) { - case 0: return 0; - - // Texts under 7 bytes long are definitely below the largest prime. 
- case 1: - hash_low = _sz_shift_low(text[0]); - hash_high = _sz_shift_high(text[0]); - break; - case 2: - hash_low = _sz_shift_low(text[0]) * 31ull + _sz_shift_low(text[1]); - hash_high = _sz_shift_high(text[0]) * 257ull + _sz_shift_high(text[1]); - break; - case 3: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull + // - _sz_shift_low(text[2]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull + // - _sz_shift_high(text[2]); - break; - case 4: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull + // - _sz_shift_low(text[3]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull + // - _sz_shift_high(text[3]); - break; - case 5: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull + // - _sz_shift_low(text[4]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull + // - _sz_shift_high(text[4]); - break; - case 6: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull + // - _sz_shift_low(text[5]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull + // - _sz_shift_high(text[5]); - break; - case 7: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull * 31ull + // - _sz_shift_low(text[5]) * 31ull + // - _sz_shift_low(text[6]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull * 257ull + // - _sz_shift_high(text[5]) * 257ull + // - _sz_shift_high(text[6]); - break; - default: - // Unroll the first seven cycles: - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - hash_low = hash_low * 31ull + _sz_shift_low(text[1]); - hash_high = hash_high * 257ull + _sz_shift_high(text[1]); - hash_low = hash_low * 31ull + _sz_shift_low(text[2]); - hash_high = hash_high * 257ull + _sz_shift_high(text[2]); - hash_low = hash_low * 31ull + _sz_shift_low(text[3]); - hash_high = hash_high * 257ull + _sz_shift_high(text[3]); - hash_low = hash_low * 31ull + _sz_shift_low(text[4]); - hash_high = hash_high * 257ull + _sz_shift_high(text[4]); - hash_low = hash_low * 31ull + 
_sz_shift_low(text[5]); - hash_high = hash_high * 257ull + _sz_shift_high(text[5]); - hash_low = hash_low * 31ull + _sz_shift_low(text[6]); - hash_high = hash_high * 257ull + _sz_shift_high(text[6]); - text += 7; - - // Iterate throw the rest with the modulus: - for (; text != text_end; ++text) { - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - } - break; - } - - return _sz_hash_mix(hash_low, hash_high); -} - SZ_PUBLIC void sz_hashes_serial(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // sz_hash_callback_t callback, void *callback_handle) { @@ -387,86 +210,6 @@ SZ_INTERNAL void _sz_hashes_fingerprint_scalar_callback( // #pragma GCC target("avx2") #pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) -SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "loads". - // - // A typical AWS Skylake instance can have 32 KB x 2 blocks of L1 data cache per core, - // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. - // For now, let's avoid the cases beyond the L2 size. - int is_huge = length > 1ull * 1024ull * 1024ull; - - // When the buffer is small, there isn't much to innovate. - if (length <= 32) { return sz_checksum_serial(text, length); } - else if (!is_huge) { - sz_u256_vec_t text_vec, sums_vec; - sums_vec.ymm = _mm256_setzero_si256(); - for (; length >= 32; text += 32, length -= 32) { - text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - sz_u64_t result = low + high; - if (length) result += sz_checksum_serial(text, length); - return result; - } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // Most notably, we can avoid populating the cache with the entire buffer, and instead traverse it in 2 directions. - else { - sz_size_t head_length = (32 - ((sz_size_t)text % 32)) % 32; // 31 or less. - sz_size_t tail_length = (sz_size_t)(text + length) % 32; // 31 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. - sz_u64_t result = 0; - - // Handle the head - while (head_length--) result += *text++; - - sz_u256_vec_t text_vec, sums_vec; - sums_vec.ymm = _mm256_setzero_si256(); - // Fill the aligned body of the buffer. - if (!is_huge) { - for (; body_length >= 32; text += 32, body_length -= 32) { - text_vec.ymm = _mm256_stream_load_si256((__m256i const *)text); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - } - // When the biffer is huge, we can traverse it in 2 directions. 
- else { - sz_u256_vec_t text_reversed_vec, sums_reversed_vec; - sums_reversed_vec.ymm = _mm256_setzero_si256(); - for (; body_length >= 64; text += 64, body_length -= 64) { - text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - text_reversed_vec.ymm = _mm256_stream_load_si256((__m256i *)(text + body_length - 64)); - sums_reversed_vec.ymm = _mm256_add_epi64( - sums_reversed_vec.ymm, _mm256_sad_epu8(text_reversed_vec.ymm, _mm256_setzero_si256())); - } - if (body_length >= 32) { - text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - } - sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, sums_reversed_vec.ymm); - } - - // Handle the tail - while (tail_length--) result += *text++; - - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. - __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - result += low + high; - return result; - } -} - /** * @brief There is no AVX2 instruction for fast multiplication of 64-bit integers. * This implementation is coming from Agner Fog's Vector Class Library. @@ -642,100 +385,6 @@ SZ_PUBLIC void sz_hashes_haswell(sz_cptr_t start, sz_size_t length, sz_size_t wi #pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ apply_to = function) -SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) { - // The naive implementation of this function is very simple. - // It assumes the CPU is great at handling unaligned "loads". - // - // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, - // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. - // With two strings, we may consider the overall workload huge, if each exceeds 1 MB in length. - int const is_huge = length >= 1ull * 1024ull * 1024ull; - sz_u512_vec_t text_vec, sums_vec; - - // When the buffer is small, there isn't much to innovate. - if (length <= 16) { - __mmask16 mask = _sz_u16_mask_until(length); - text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text); - sums_vec.xmms[0] = _mm_sad_epu8(text_vec.xmms[0], _mm_setzero_si128()); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_vec.xmms[0]); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_vec.xmms[0], 1); - return low + high; - } - else if (length <= 32) { - __mmask32 mask = _sz_u32_mask_until(length); - text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text); - sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256()); - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. 
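Both checksum kernels being removed here lean on `_mm256_sad_epu8`, which sums each group of eight unsigned bytes against zero into a 64-bit lane, so a 32-byte chunk collapses to four partial sums that are only folded together at the very end. A self-contained sketch of that pattern for an AVX2 build (assuming `-mavx2` or equivalent; this is a simplified model without the cache-size and alignment branching of the real code):

```cpp
#include <immintrin.h> // build with AVX2 enabled, e.g. -mavx2
#include <cstddef>
#include <cstdint>

// Sum all bytes of `data` as unsigned values; the last <32 bytes are handled in scalar code.
std::uint64_t bytesum_avx2(unsigned char const *data, std::size_t length) {
    __m256i sums = _mm256_setzero_si256();
    std::size_t i = 0;
    for (; i + 32 <= length; i += 32) {
        __m256i chunk = _mm256_loadu_si256((__m256i const *)(data + i));
        // SAD against zero yields four 64-bit partial sums of eight bytes each.
        sums = _mm256_add_epi64(sums, _mm256_sad_epu8(chunk, _mm256_setzero_si256()));
    }
    // Fold the two 128-bit halves, then the two remaining 64-bit lanes.
    __m128i low = _mm256_castsi256_si128(sums);
    __m128i high = _mm256_extracti128_si256(sums, 1);
    __m128i both = _mm_add_epi64(low, high);
    std::uint64_t result = (std::uint64_t)_mm_cvtsi128_si64(both) + (std::uint64_t)_mm_extract_epi64(both, 1);
    for (; i < length; ++i) result += data[i]; // scalar tail
    return result;
}
```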
- __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]); - __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1); - __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); - sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); - sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); - return low + high; - } - else if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); - text_vec.zmm = _mm512_maskz_loadu_epi8(mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - return _mm512_reduce_add_epi64(sums_vec.zmm); - } - else if (!is_huge) { - sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less. - sz_size_t tail_length = (sz_size_t)(text + length) % 64; // 63 or less. - sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - for (text += head_length; body_length >= 64; text += 64, body_length -= 64) { - text_vec.zmm = _mm512_load_si512((__m512i const *)text); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - } - text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - return _mm512_reduce_add_epi64(sums_vec.zmm); - } - // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. - // - // 1. Moving in both directions to maximize the throughput, when fetching from multiple - // memory pages. Also helps with cache set-associativity issues, as we won't always - // be fetching the same entries in the lookup table. - // 2. Using non-temporal stores to avoid polluting the cache. - // 3. Prefetching the next cache line, to avoid stalling the CPU. This generally useless - // for predictable patterns, so disregard this advice. - // - // Bidirectional traversal generally adds about 10% to such algorithms. - else { - sz_u512_vec_t text_reversed_vec, sums_reversed_vec; - sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; - sz_size_t tail_length = (sz_size_t)(text + length) % 64; - sz_size_t body_length = length - head_length - tail_length; - __mmask64 head_mask = _sz_u64_mask_until(head_length); - __mmask64 tail_mask = _sz_u64_mask_until(tail_length); - - text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length); - sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()); - - // Now in the main loop, we can use non-temporal loads and stores, - // performing the operation in both directions. 
- for (text += head_length; body_length >= 128; text += 64, text += 64, body_length -= 128) { - text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text)); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - text_reversed_vec.zmm = _mm512_stream_load_si512((__m512i *)(text + body_length - 64)); - sums_reversed_vec.zmm = - _mm512_add_epi64(sums_reversed_vec.zmm, _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512())); - } - if (body_length >= 64) { - text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text)); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - } - - return _mm512_reduce_add_epi64(_mm512_add_epi64(sums_vec.zmm, sums_reversed_vec.zmm)); - } -} - SZ_PUBLIC void sz_hashes_ice(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // sz_hash_callback_t callback, void *callback_handle) { @@ -875,24 +524,6 @@ SZ_PUBLIC void sz_hashes_ice(sz_cptr_t start, sz_size_t length, sz_size_t window #pragma GCC target("arch=armv8.2-a+simd") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) -SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) { - uint64x2_t sum_vec = vdupq_n_u64(0); - - // Process 16 bytes (128 bits) at a time - for (; length >= 16; text += 16, length -= 16) { - uint8x16_t vec = vld1q_u8((sz_u8_t const *)text); // Load 16 bytes - uint16x8_t pairwise_sum1 = vpaddlq_u8(vec); // Pairwise add lower and upper 8 bits - uint32x4_t pairwise_sum2 = vpaddlq_u16(pairwise_sum1); // Pairwise add 16-bit results - uint64x2_t pairwise_sum3 = vpaddlq_u32(pairwise_sum2); // Pairwise add 32-bit results - sum_vec = vaddq_u64(sum_vec, pairwise_sum3); // Accumulate the sum - } - - // Final reduction of `sum_vec` to a single scalar - sz_u64_t sum = vgetq_lane_u64(sum_vec, 0) + vgetq_lane_u64(sum_vec, 1); - if (length) sum += sz_checksum_serial(text, length); - return sum; -} - #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_NEON @@ -918,18 +549,6 @@ SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) { #pragma region Compile Time Dispatching #if !SZ_DYNAMIC_DISPATCH -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { -#if SZ_USE_ICE - return sz_checksum_ice(text, length); -#elif SZ_USE_HASWELL - return sz_checksum_haswell(text, length); -#elif SZ_USE_NEON - return sz_checksum_neon(text, length); -#else - return sz_checksum_serial(text, length); -#endif -} - SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // sz_hash_callback_t callback, void *callback_handle) { #if SZ_USE_ICE @@ -941,11 +560,6 @@ SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_len #endif } -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { - sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data); -} - #endif // !SZ_DYNAMIC_DISPATCH #pragma endregion // Compile Time Dispatching diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index 262cbdc9..4afe9572 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -52,78 +52,6 @@ SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length) { return 0; } -/** - * @brief Computes the Karp-Rabin rolling hashes of a string supplying them to the 
provided `callback`. - * Can be used for similarity scores, search, ranking, etc. - * - * Rabin-Karp-like rolling hashes can have very high-level of collisions and depend - * on the choice of bases and the prime number. That's why, often two hashes from the same - * family are used with different bases. - * - * 1. Kernighan and Ritchie's function uses 31, a prime close to the size of English alphabet. - * 2. To be friendlier to byte-arrays and UTF8, we use 257 for the second function. - * - * Choosing the right ::window_length is task- and domain-dependant. For example, most English words are - * between 3 and 7 characters long, so a window of 4 bytes would be a good choice. For DNA sequences, - * the ::window_length might be a multiple of 3, as the codons are 3 (nucleotides) bytes long. - * With such minimalistic alphabets of just four characters (AGCT) longer windows might be needed. - * For protein sequences the alphabet is 20 characters long, so the window can be shorter, than for DNAs. - * - * @param text String to hash. - * @param length Number of bytes in the string. - * @param window_length Length of the rolling window in bytes. - * @param window_step Step of reported hashes. @b Must be power of two. Should be smaller than `window_length`. - * @param callback Function receiving the start & length of a substring, the hash, and the `callback_handle`. - * @param callback_handle Optional user-provided pointer to be passed to the `callback`. - * @see sz_hashes_fingerprint, sz_hashes_intersection - */ -SZ_DYNAMIC void sz_hashes( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); - -/** - * @brief Computes the Karp-Rabin rolling hashes of a string outputting a binary fingerprint. - * Such fingerprints can be compared with Hamming or Jaccard (Tanimoto) distance for similarity. - * - * The algorithm doesn't clear the fingerprint buffer on start, so it can be invoked multiple times - * to produce a fingerprint of a longer string, by passing the previous fingerprint as the ::fingerprint. - * It can also be reused to produce multi-resolution fingerprints by changing the ::window_length - * and calling the same function multiple times for the same input ::text. - * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text String to hash. - * @param length Number of bytes in the string. - * @param fingerprint Output fingerprint buffer. - * @param fingerprint_bytes Number of bytes in the fingerprint buffer. - * @param window_length Length of the rolling window in bytes. - * @see sz_hashes, sz_hashes_intersection - */ -SZ_PUBLIC void sz_hashes_fingerprint( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_ptr_t fingerprint, sz_size_t fingerprint_bytes) { - sz_unused(text && length && window_length && fingerprint && fingerprint_bytes); -} - -/** - * @brief Given a hash-fingerprint of a textual document, computes the number of intersecting hashes - * of the incoming document. Can be used for document scoring and search. - * - * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer, - * avoiding cache-coherency penalties of remote on-heap buffers. - * - * @param text Input document. - * @param length Number of bytes in the input document. - * @param fingerprint Reference document fingerprint. 
- * @param fingerprint_bytes Number of bytes in the reference documents fingerprint. - * @param window_length Length of the rolling window in bytes. - * @see sz_hashes, sz_hashes_fingerprint - */ -SZ_PUBLIC sz_size_t sz_hashes_intersection( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, // - sz_cptr_t fingerprint, sz_size_t fingerprint_bytes); - /** * @brief Generates a random string for a given alphabet, avoiding integer division and modulo operations. * Similar to `text[i] = alphabet[rand() % cardinality]`. @@ -299,78 +227,6 @@ SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length) { return _sz_hash_mix(hash_low, hash_high); } -SZ_PUBLIC void sz_hashes_serial(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Compute the initial hash value for the first window. - sz_u64_t hash_low = 0, hash_high = 0, hash_mix; - for (sz_u8_t const *first_end = text + window_length; text < first_end; ++text) - hash_low = (hash_low * 31ull + _sz_shift_low(*text)) % SZ_U64_MAX_PRIME, - hash_high = (hash_high * 257ull + _sz_shift_high(*text)) % SZ_U64_MAX_PRIME; - - // In most cases the fingerprint length will be a power of two. - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - - // Compute the hash value for every window, exporting into the fingerprint, - // using the expensive modulo operation. - sz_size_t cycles = 1; - sz_size_t const step_mask = step - 1; - for (; text < text_end; ++text, ++cycles) { - // Discard one character: - hash_low -= _sz_shift_low(*(text - window_length)) * prime_power_low; - hash_high -= _sz_shift_high(*(text - window_length)) * prime_power_high; - // And add a new one: - hash_low = 31ull * hash_low + _sz_shift_low(*text); - hash_high = 257ull * hash_high + _sz_shift_high(*text); - // Wrap the hashes around: - hash_low = _sz_prime_mod(hash_low); - hash_high = _sz_prime_mod(hash_high); - // Mix only if we've skipped enough hashes. - if ((cycles & step_mask) == 0) { - hash_mix = _sz_hash_mix(hash_low, hash_high); - callback((sz_cptr_t)text, window_length, hash_mix, callback_handle); - } - } -} - -/** @brief An internal callback used to set a bit in a power-of-two length binary fingerprint of a string. */ -SZ_INTERNAL void _sz_hashes_fingerprint_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) & (fingerprint_bytes - 1)] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback used to set a bit in a @b non power-of-two length binary fingerprint of a string. 
*/ -SZ_INTERNAL void _sz_hashes_fingerprint_non_pow2_callback( // - sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) { - sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle; - sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start; - sz_size_t fingerprint_bytes = fingerprint_buffer->length; - fingerprint_u8s[(hash / 8) % fingerprint_bytes] |= (1 << (hash & 7)); - sz_unused(start && length); -} - -/** @brief An internal callback, used to mix all the running hashes into one pointer-size value. */ -SZ_INTERNAL void _sz_hashes_fingerprint_scalar_callback( // - sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *scalar_handle) { - sz_unused(start && length && hash && scalar_handle); - sz_size_t *scalar_ptr = (sz_size_t *)scalar_handle; - *scalar_ptr ^= hash; -} - #undef _sz_shift_low #undef _sz_shift_high #undef _sz_hash_mix @@ -467,147 +323,6 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) { } } -/** - * @brief There is no AVX2 instruction for fast multiplication of 64-bit integers. - * This implementation is coming from Agner Fog's Vector Class Library. - */ -SZ_INTERNAL __m256i _mm256_mul_epu64(__m256i a, __m256i b) { - __m256i bswap = _mm256_shuffle_epi32(b, 0xB1); - __m256i prodlh = _mm256_mullo_epi32(a, bswap); - __m256i zero = _mm256_setzero_si256(); - __m256i prodlh2 = _mm256_hadd_epi32(prodlh, zero); - __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2, 0x73); - __m256i prodll = _mm256_mul_epu32(a, b); - __m256i prod = _mm256_add_epi64(prodll, prodlh3); - return prod; -} - -SZ_PUBLIC void sz_hashes_haswell(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - if (length < 4 * window_length) { - sz_hashes_serial(start, length, window_length, step, callback, callback_handle); - return; - } - - // Using AVX2, we can perform 4 long integer multiplications and additions within one register. - // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel. - sz_size_t const max_hashes = length - window_length + 1; - sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads. - sz_u8_t const *text_first = (sz_u8_t const *)start; - sz_u8_t const *text_second = text_first + min_hashes_per_thread; - sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2; - sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3; - sz_u8_t const *text_end = text_first + length; - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // Broadcast the constants into the registers. 
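The `_mm256_mul_epu64` helper removed in this hunk exists because AVX2 has no 64-bit lane multiply, so the product is rebuilt from 32-bit halves following Agner Fog's Vector Class Library. A scalar model of the identity each lane computes (the high×high term falls entirely outside the low 64 bits and is dropped):

```cpp
#include <cassert>
#include <cstdint>

// Reassemble a*b (mod 2^64) from 32-bit halves, mirroring what the vectorized
// shuffle / mullo / hadd sequence computes lane by lane.
std::uint64_t mul_from_halves(std::uint64_t a, std::uint64_t b) {
    std::uint64_t a_lo = a & 0xFFFFFFFFull, a_hi = a >> 32;
    std::uint64_t b_lo = b & 0xFFFFFFFFull, b_hi = b >> 32;
    std::uint64_t low = a_lo * b_lo;                 // the `_mm256_mul_epu32` contribution
    std::uint64_t cross = a_lo * b_hi + a_hi * b_lo; // the two cross terms
    return low + (cross << 32);                      // a_hi * b_hi vanishes modulo 2^64
}

int main() {
    assert(mul_from_halves(31ull, 11400714819323198485ull) == 31ull * 11400714819323198485ull);
    assert(mul_from_halves(0xDEADBEEFCAFEBABEull, 257ull) == 0xDEADBEEFCAFEBABEull * 257ull);
    return 0;
}
```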
- sz_u256_vec_t prime_vec, golden_ratio_vec; - sz_u256_vec_t base_low_vec, base_high_vec, prime_power_low_vec, prime_power_high_vec, shift_high_vec; - base_low_vec.ymm = _mm256_set1_epi64x(31ull); - base_high_vec.ymm = _mm256_set1_epi64x(257ull); - shift_high_vec.ymm = _mm256_set1_epi64x(77ull); - prime_vec.ymm = _mm256_set1_epi64x(SZ_U64_MAX_PRIME); - golden_ratio_vec.ymm = _mm256_set1_epi64x(11400714819323198485ull); - prime_power_low_vec.ymm = _mm256_set1_epi64x(prime_power_low); - prime_power_high_vec.ymm = _mm256_set1_epi64x(prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u256_vec_t hash_low_vec, hash_high_vec, hash_mix_vec, chars_low_vec, chars_high_vec; - hash_low_vec.ymm = _mm256_setzero_si256(); - hash_high_vec.ymm = _mm256_setzero_si256(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. - hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8( // - hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8( // - hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - } - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t const step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. 
- chars_low_vec.ymm = _mm256_set_epi64x( // - text_fourth[-window_length], text_third[-window_length], text_second[-window_length], - text_first[-window_length]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - hash_low_vec.ymm = - _mm256_sub_epi64(hash_low_vec.ymm, _mm256_mul_epu64(chars_low_vec.ymm, prime_power_low_vec.ymm)); - hash_high_vec.ymm = - _mm256_sub_epi64(hash_high_vec.ymm, _mm256_mul_epu64(chars_high_vec.ymm, prime_power_high_vec.ymm)); - - // 1. Multiply the hashes by the base. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm); - - // 3. Add the incoming characters. - hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm); - hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_low_vec.ymm = _mm256_blendv_epi8( // - hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm)); - hash_high_vec.ymm = _mm256_blendv_epi8( // - hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm), - _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm)); - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. - hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm); - hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm); - hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm); - if ((cycle & step_mask) == 0) { - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - } - } -} - #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_HASWELL @@ -736,131 +451,6 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) { } } -SZ_PUBLIC void sz_hashes_ice(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - - if (length < window_length || !window_length) return; - if (length < 4 * window_length) { - sz_hashes_serial(start, length, window_length, step, callback, callback_handle); - return; - } - - // Using AVX2, we can perform 4 long integer multiplications and additions within one register. - // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel. - sz_size_t const max_hashes = length - window_length + 1; - sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads. 
- sz_u8_t const *text_first = (sz_u8_t const *)start; - sz_u8_t const *text_second = text_first + min_hashes_per_thread; - sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2; - sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3; - sz_u8_t const *text_end = text_first + length; - - // Broadcast the global constants into the registers. - // Both high and low hashes will work with the same prime and golden ratio. - sz_u512_vec_t prime_vec, golden_ratio_vec; - prime_vec.zmm = _mm512_set1_epi64(SZ_U64_MAX_PRIME); - golden_ratio_vec.zmm = _mm512_set1_epi64(11400714819323198485ull); - - // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic. - sz_u64_t prime_power_low = 1, prime_power_high = 1; - for (sz_size_t i = 0; i + 1 < window_length; ++i) - prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME, - prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME; - - // We will be evaluating 4 offsets at a time with 2 different hash functions. - // We can fit all those 8 state variables in each of the following ZMM registers. - sz_u512_vec_t base_vec, prime_power_vec, shift_vec; - base_vec.zmm = _mm512_set_epi64(31ull, 31ull, 31ull, 31ull, 257ull, 257ull, 257ull, 257ull); - shift_vec.zmm = _mm512_set_epi64(0ull, 0ull, 0ull, 0ull, 77ull, 77ull, 77ull, 77ull); - prime_power_vec.zmm = _mm512_set_epi64(prime_power_low, prime_power_low, prime_power_low, prime_power_low, - prime_power_high, prime_power_high, prime_power_high, prime_power_high); - - // Compute the initial hash values for every one of the four windows. - sz_u512_vec_t hash_vec, chars_vec; - hash_vec.zmm = _mm512_setzero_si512(); - for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end; - ++text_first, ++text_second, ++text_third, ++text_fourth) { - - // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`... - chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - - // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); - } - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. 
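The step-4 comments above subtract the prime at most once per update, via `_mm256_blendv_epi8` / `_mm512_mask_blend_epi8` with a compare mask, instead of issuing a full modulo; that shortcut is exact whenever the running value is below twice the modulus. A scalar sketch of the reduction identity being exploited, with a tiny stand-in prime:

```cpp
#include <cassert>
#include <cstdint>

// For 0 <= value < 2 * modulus a single conditional subtraction is a complete reduction.
std::uint64_t reduce_once(std::uint64_t value, std::uint64_t modulus) {
    return value >= modulus ? value - modulus : value;
}

int main() {
    constexpr std::uint64_t prime = 59; // illustrative; the real code uses a prime near 2^64
    for (std::uint64_t value = 0; value < 2 * prime; ++value)
        assert(reduce_once(value, prime) == value % prime);
    return 0;
}
```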
- sz_u512_vec_t hash_mix_vec; - hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_extracti64x4_epi64(hash_mix_vec.zmm, 0)); - - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - - // Now repeat that operation for the remaining characters, discarding older characters. - sz_size_t cycle = 1; - sz_size_t step_mask = step - 1; - for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) { - // 0. Load again the four characters we are dropping, shift them, and subtract. - chars_vec.zmm = _mm512_set_epi64(text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length], // - text_fourth[-window_length], text_third[-window_length], - text_second[-window_length], text_first[-window_length]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - hash_vec.zmm = _mm512_sub_epi64(hash_vec.zmm, _mm512_mullo_epi64(chars_vec.zmm, prime_power_vec.zmm)); - - // 1. Multiply the hashes by the base. - hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm); - - // 2. Load the four characters from `text_first`, `text_first + max_hashes_per_thread`, - // `text_first + max_hashes_per_thread * 2`, `text_first + max_hashes_per_thread * 3`. - chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], // - text_fourth[0], text_third[0], text_second[0], text_first[0]); - chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); - - // ... and prefetch the next four characters into Level 2 or higher. - _mm_prefetch((sz_cptr_t)text_fourth + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_third + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_second + 1, _MM_HINT_T1); - _mm_prefetch((sz_cptr_t)text_first + 1, _MM_HINT_T1); - - // 3. Add the incoming characters. - hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); - - // 4. Compute the modulo. Assuming there are only 59 values between our prime - // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime. - hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm, - _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm)); - - // 5. Compute the hash mix, that will be used to index into the fingerprint. - // This includes a serial step at the end. 
- hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm); - hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), // - _mm512_castsi512_si256(hash_mix_vec.zmm)); - - if ((cycle & step_mask) == 0) { - callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle); - callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle); - callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle); - callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle); - } - } -} - #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_ICE @@ -930,17 +520,6 @@ SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { #endif } -SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle) { -#if SZ_USE_ICE - sz_hashes_ice(text, length, window_length, window_step, callback, callback_handle); -#elif SZ_USE_HASWELL - sz_hashes_haswell(text, length, window_length, window_step, callback, callback_handle); -#else - sz_hashes_serial(text, length, window_length, window_step, callback, callback_handle); -#endif -} - SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, sz_random_generator_t generator, void *generator_user_data) { sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data); diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index 0f0aaef9..af677aad 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -4019,7 +4019,7 @@ void sorted_order(objects_type_ const *begin, objects_type_ const *end, sorted_i } #if !SZ_AVOID_STL - +#if _SZ_DEPRECATED_FINGERPRINTS /** * @brief Computes the Rabin-Karp-like rolling binary fingerprint of a string. * @see sz_hashes @@ -4052,6 +4052,7 @@ template std::bitset hashes_fingerprint(basic_string const &str, std::size_t window_length) noexcept { return ashvardanian::stringzilla::hashes_fingerprint(str.view(), window_length); } +#endif /** * @brief Computes the permutation of an array, that would lead to sorted order. diff --git a/scripts/test.cpp b/scripts/test.cpp index dc13a656..2bf886d1 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -133,53 +133,6 @@ static void test_arithmetical_utilities() { #endif } -/** - * @brief Tests various ASCII-based methods (e.g., `is_alpha`, `is_digit`) - * provided by `sz::string` and `sz::string_view`. 
- */ -template -static void test_ascii_utilities() { - - using str = string_type; - - assert(!str("").is_alpha()); - assert(str("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ").is_alpha()); - assert(!str("abc9").is_alpha()); - - assert(!str("").is_alnum()); - assert(str("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789").is_alnum()); - assert(!str("abc!").is_alnum()); - - assert(str("").is_ascii()); - assert(str("\x00x7F").is_ascii()); - assert(!str("abc123🔥").is_ascii()); - - assert(!str("").is_digit()); - assert(str("0123456789").is_digit()); - assert(!str("012a").is_digit()); - - assert(!str("").is_lower()); - assert(str("abcdefghijklmnopqrstuvwxyz").is_lower()); - assert(!str("abcA").is_lower()); - assert(!str("abc\n").is_lower()); - - assert(!str("").is_space()); - assert(str(" \t\n\r\f\v").is_space()); - assert(!str(" \t\r\na").is_space()); - - assert(!str("").is_upper()); - assert(str("ABCDEFGHIJKLMNOPQRSTUVWXYZ").is_upper()); - assert(!str("ABCa").is_upper()); - - assert(str("").is_printable()); - assert(str("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+").is_printable()); - assert(!str("012🔥").is_printable()); - - assert(str("").contains_only(sz::char_set("abc"))); - assert(str("abc").contains_only(sz::char_set("abc"))); - assert(!str("abcd").contains_only(sz::char_set("abc"))); -} - /** * @brief Tests various ASCII-based methods (e.g., `is_alpha`, `is_digit`) * provided by `sz::string` and `sz::string_view`. @@ -892,6 +845,8 @@ static void test_non_stl_extensions_for_reads() { assert(str("abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz").checksum() == arithmetic_sum('a', 'z') * 3); +#if _SZ_DEPRECATED_FINGERPRINTS + // Computing rolling fingerprints. assert(sz::hashes_fingerprint<512>(str("aaaa"), 3).count() == 1); assert(sz::hashes_fingerprint<512>(str("hello"), 4).count() == 2); @@ -903,7 +858,7 @@ static void test_non_stl_extensions_for_reads() { assert(sz::hashes_fingerprint<512>(str("aaa"), 3).count() == 1); assert(sz::hashes_fingerprint<512>(str("aaaa"), 3).count() == 1); assert(sz::hashes_fingerprint<512>(str("aaaaa"), 3).count() == 1); - +#endif // Computing fuzzy search results. 
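The fingerprint helpers removed from `hash.h` (and the `hashes_fingerprint` tests now guarded above) set one bit per rolling hash: for a power-of-two buffer the byte index is `(hash / 8) & (bytes - 1)`, the bit within it is `hash & 7`, and two documents can then be compared by the popcount of the AND of their bitmaps. A small sketch of that indexing; the helper names here are invented, not the library API:

```cpp
#include <bitset>
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr std::size_t fingerprint_bytes = 64; // must stay a power of two for the `&` trick

void set_hash_bit(std::vector<std::uint8_t> &fingerprint, std::uint64_t hash) {
    fingerprint[(hash / 8) & (fingerprint.size() - 1)] |= (std::uint8_t)(1u << (hash & 7u));
}

std::size_t intersection_popcount(std::vector<std::uint8_t> const &a, std::vector<std::uint8_t> const &b) {
    std::size_t count = 0;
    for (std::size_t i = 0; i != a.size(); ++i) count += std::bitset<8>(a[i] & b[i]).count();
    return count;
}

int main() {
    std::vector<std::uint8_t> first(fingerprint_bytes, 0), second(fingerprint_bytes, 0);
    for (std::uint64_t hash : {42ull, 1337ull, 98765ull}) set_hash_bit(first, hash);
    for (std::uint64_t hash : {42ull, 1337ull, 11111ull}) set_hash_bit(second, hash);
    std::printf("shared bits: %zu\n", intersection_popcount(first, second));
    return 0;
}
```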
} From 1de3166344e817e91132371e17e1560809a55194 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 12 Feb 2025 23:47:28 +0000 Subject: [PATCH 076/751] Make: Move drafts --- CONTRIBUTING.md | 4 ++-- c/lib.c | 2 +- include/stringzilla/drafts.h => drafts/bitap.h | 0 {include/stringzilla => drafts}/fingerprint.h | 0 include/stringzilla/hash.h | 11 +++-------- scripts/bench_token.cpp | 4 ++++ 6 files changed, 10 insertions(+), 11 deletions(-) rename include/stringzilla/drafts.h => drafts/bitap.h (100%) rename {include/stringzilla => drafts}/fingerprint.h (100%) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 231291c8..dfb4fb2f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -129,8 +129,8 @@ Using modern syntax, this is how you build and run the test suite: ```bash cmake -D STRINGZILLA_BUILD_TEST=1 -D CMAKE_BUILD_TYPE=Debug -B build_debug -cmake --build build_debug --config Debug # Which will produce the following targets: -build_debug/stringzilla_test_cpp20 # Unit test for the entire library compiled for current hardware +cmake --build build_debug --config Debug # Which will produce the following targets: +build_debug/stringzilla_test_cpp20 # Unit test for the entire library compiled for current hardware build_debug/stringzilla_test_cpp20_serial # x86 variant compiled for IvyBridge - last arch. before AVX2 build_debug/stringzilla_test_cpp20_serial # Arm variant compiled without Neon ``` diff --git a/c/lib.c b/c/lib.c index d829e379..52a6ce7a 100644 --- a/c/lib.c +++ b/c/lib.c @@ -224,7 +224,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->edit_distance = sz_edit_distance_serial; impl->alignment_score = sz_alignment_score_serial; - impl->hashes = sz_hashes_serial; + impl->hashes = 0; #if SZ_USE_HASWELL if (caps & sz_cap_haswell_k) { diff --git a/include/stringzilla/drafts.h b/drafts/bitap.h similarity index 100% rename from include/stringzilla/drafts.h rename to drafts/bitap.h diff --git a/include/stringzilla/fingerprint.h b/drafts/fingerprint.h similarity index 100% rename from include/stringzilla/fingerprint.h rename to drafts/fingerprint.h diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index 4afe9572..52b6d372 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -78,11 +78,6 @@ SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length); /** @copydoc sz_hash */ SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length); -/** @copydoc sz_hashes */ -SZ_PUBLIC void sz_hashes_serial( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, // - sz_hash_callback_t callback, void *callback_handle); - /** @copydoc sz_generate */ SZ_PUBLIC void sz_generate_serial( // sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, sz_random_generator_t generate, @@ -261,7 +256,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) { text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text); sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); } - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. + // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first. 
__m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); @@ -291,7 +286,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) { sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); } } - // When the biffer is huge, we can traverse it in 2 directions. + // When the buffer is huge, we can traverse it in 2 directions. else { sz_u256_vec_t text_reversed_vec, sums_reversed_vec; sums_reversed_vec.ymm = _mm256_setzero_si256(); @@ -312,7 +307,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) { // Handle the tail while (tail_length--) result += *text++; - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. + // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first. __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); diff --git a/scripts/bench_token.cpp b/scripts/bench_token.cpp index eb82dfd4..35369ac0 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -46,6 +46,7 @@ tracked_unary_functions_t hashing_functions() { } tracked_unary_functions_t sliding_hashing_functions(std::size_t window_width, std::size_t step) { +#if _SZ_DEPRECATED_FINGERPRINTS auto wrap_sz = [=](auto function) -> unary_function_t { return unary_function_t([function, window_width, step](std::string_view s) { sz_size_t mixed_hash = 0; @@ -53,8 +54,10 @@ tracked_unary_functions_t sliding_hashing_functions(std::size_t window_width, st return mixed_hash; }); }; +#endif std::string suffix = std::to_string(window_width) + ":step" + std::to_string(step); tracked_unary_functions_t result = { +#if _SZ_DEPRECATED_FINGERPRINTS #if SZ_USE_ICE {"sz_hashes_ice:" + suffix, wrap_sz(sz_hashes_ice)}, #endif @@ -62,6 +65,7 @@ tracked_unary_functions_t sliding_hashing_functions(std::size_t window_width, st {"sz_hashes_haswell:" + suffix, wrap_sz(sz_hashes_haswell)}, #endif {"sz_hashes_serial:" + suffix, wrap_sz(sz_hashes_serial)}, +#endif }; return result; } From 0a3e363a4439db233758e944727efef72e373243 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 13 Feb 2025 13:21:45 +0000 Subject: [PATCH 077/751] Improve: Relax many `constexpr`s from C++20 to C++14 --- include/stringzilla/stringzilla.hpp | 33 ++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index af677aad..126211c4 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -36,9 +36,20 @@ #define _SZ_IS_CPP98 (__cplusplus >= 199711L) /** - * @brief The `constexpr` keyword has different applicability scope in different C++ versions. + * @brief Expands to `constexpr` in C++20 and later, and to nothing in older C++ versions. * Useful for STL conversion operators, as several `std::string` members are `constexpr` in C++20. + * + * The `constexpr` keyword has different applicability scope in different C++ versions. + * - C++11: Introduced `constexpr`, but no loops or multiple `return` statements were allowed. + * - C++14: Allowed loops, multiple statements, and local variables in `constexpr` functions. + * - C++17: Added the `if constexpr` construct for compile-time branching. 
+ * - C++20: Added some dynamic memory allocations, `virtual` functions, and `try`/`catch` blocks. */ +#if _SZ_IS_CPP14 +#define sz_constexpr_if_cpp14 constexpr +#else +#define sz_constexpr_if_cpp14 +#endif #if _SZ_IS_CPP20 #define sz_constexpr_if_cpp20 constexpr #else @@ -277,12 +288,12 @@ class basic_char_set { // ! Instead of relying on the `sz_charset_init`, we have to reimplement it to support `constexpr`. bitset_._u64s[0] = 0, bitset_._u64s[1] = 0, bitset_._u64s[2] = 0, bitset_._u64s[3] = 0; } - explicit constexpr basic_char_set(std::initializer_list chars) noexcept : basic_char_set() { + explicit sz_constexpr_if_cpp14 basic_char_set(std::initializer_list chars) noexcept : basic_char_set() { // ! Instead of relying on the `sz_charset_add(&bitset_, c)`, we have to reimplement it to support `constexpr`. for (auto c : chars) bitset_._u64s[sz_bitcast(sz_u8_t, c) >> 6] |= (1ull << (sz_bitcast(sz_u8_t, c) & 63u)); } - explicit constexpr basic_char_set(char_type const *chars, std::size_t count_characters) noexcept + explicit sz_constexpr_if_cpp14 basic_char_set(char_type const *chars, std::size_t count_characters) noexcept : basic_char_set() { for (std::size_t i = 0; i < count_characters; ++i) { char_type c = chars[i]; @@ -291,7 +302,7 @@ class basic_char_set { } template - explicit constexpr basic_char_set(std::array const &chars) noexcept + explicit sz_constexpr_if_cpp14 basic_char_set(std::array const &chars) noexcept : basic_char_set() { static_assert(count_characters > 0, "Character array cannot be empty"); for (std::size_t i = 0; i < count_characters; ++i) { @@ -1232,8 +1243,8 @@ class basic_string_slice { : start_(c_string), length_(null_terminated_length(c_string)) {} constexpr basic_string_slice(pointer c_string, size_type length) noexcept : start_(c_string), length_(length) {} - sz_constexpr_if_cpp20 basic_string_slice(basic_string_slice const &other) noexcept = default; - sz_constexpr_if_cpp20 basic_string_slice &operator=(basic_string_slice const &other) noexcept = default; + constexpr basic_string_slice(basic_string_slice const &other) noexcept = default; + constexpr basic_string_slice &operator=(basic_string_slice const &other) noexcept = default; basic_string_slice(std::nullptr_t) = delete; /** @brief Exchanges the view with that of the `other`. */ @@ -1927,13 +1938,13 @@ class basic_string_slice { } private: - sz_constexpr_if_cpp20 string_slice &assign(string_view const &other) noexcept { + sz_constexpr_if_cpp14 string_slice &assign(string_view const &other) noexcept { start_ = (pointer)other.data(); length_ = other.size(); return *this; } - sz_constexpr_if_cpp20 static size_type null_terminated_length(const_pointer s) noexcept { + sz_constexpr_if_cpp14 static size_type null_terminated_length(const_pointer s) noexcept { const_pointer p = s; while (*p) ++p; return p - s; @@ -2080,7 +2091,7 @@ class basic_string { #pragma region Constructors and STL Utilities - sz_constexpr_if_cpp20 basic_string() noexcept { + sz_constexpr_if_cpp14 basic_string() noexcept { // ! Instead of relying on the `sz_string_init`, we have to reimplement it to support `constexpr`. 
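The new `sz_constexpr_if_cpp14` macro reflects the language history spelled out in the comment above: C++11 `constexpr` bodies could not contain loops or mutable locals, C++14 allowed them, so members like `null_terminated_length` need nothing newer than the C++14 level. A stand-alone sketch of the same gating pattern; the macro and function below are illustrative, not the header's:

```cpp
#include <cassert>

#if __cplusplus >= 201402L
#define my_constexpr_if_cpp14 constexpr
#else
#define my_constexpr_if_cpp14
#endif

// A loop and a mutable local in a `constexpr` body require C++14 or newer.
my_constexpr_if_cpp14 unsigned long long null_terminated_length(char const *s) {
    char const *p = s;
    while (*p) ++p;
    return (unsigned long long)(p - s);
}

int main() {
#if __cplusplus >= 201402L
    static_assert(null_terminated_length("hello") == 5, "evaluated at compile time in C++14 and newer");
#endif
    assert(null_terminated_length("hello") == 5); // still works as a plain run-time call in C++11
    return 0;
}
```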
string_.internal.start = &string_.internal.chars[0]; string_.words[1] = 0; @@ -3454,7 +3465,9 @@ static_assert(sizeof(string) == 4 * sizeof(void *), "String size must be 4 point namespace literals { constexpr string_view operator""_sv(char const *str, std::size_t length) noexcept { return {str, length}; } -constexpr char_set operator""_cs(char const *str, std::size_t length) noexcept { return char_set {str, length}; } +sz_constexpr_if_cpp14 char_set operator""_cs(char const *str, std::size_t length) noexcept { + return char_set {str, length}; +} } // namespace literals template From 554f50d5e3601c6ef2984e287e7a84f91eebf1ae Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 13 Feb 2025 14:05:53 +0000 Subject: [PATCH 078/751] Add: Separate Skylake-X & Ice Lake checksums --- c/lib.c | 1 + include/stringzilla/hash.h | 185 +++++++++++++++++++++++++++++++------ scripts/bench_token.cpp | 10 +- 3 files changed, 164 insertions(+), 32 deletions(-) diff --git a/c/lib.c b/c/lib.c index 52a6ce7a..3a447d99 100644 --- a/c/lib.c +++ b/c/lib.c @@ -259,6 +259,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->rfind = sz_rfind_skylake; impl->find_byte = sz_find_byte_skylake; impl->rfind_byte = sz_rfind_byte_skylake; + impl->checksum = sz_checksum_skylake; } #endif diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index 52b6d372..bdffd583 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -5,15 +5,9 @@ * * Includes core APIs: * - * - `sz_checksum` - for byte-level checksums. + * - `sz_checksum` - for byte-level 64-bit unsigned checksums. * - `sz_hash` - for 64-bit single-shot hashing. - * - `sz_hashes` - producing the rolling hashes of a string. * - `sz_generate` - populating buffers with random data. - * - * Convenience functions for character-set matching: - * - * - `sz_hashes_fingerprint` - * - `sz_hashes_intersection` */ #ifndef STRINGZILLA_HASH_H_ #define STRINGZILLA_HASH_H_ @@ -334,6 +328,106 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) { #pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") #pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) +SZ_PUBLIC sz_u64_t sz_checksum_skylake(sz_cptr_t text, sz_size_t length) { + // The naive implementation of this function is very simple. + // It assumes the CPU is great at handling unaligned "loads". + // + // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core, + // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache. + // With two strings, we may consider the overall workload huge, if each exceeds 1 MB in length. + int const is_huge = length >= 1ull * 1024ull * 1024ull; + sz_u512_vec_t text_vec, sums_vec; + + // When the buffer is small, there isn't much to innovate. + // Separately handling even smaller payloads doesn't increase performance even on synthetic benchmarks. 
+ if (length <= 16) { + __mmask16 mask = _sz_u16_mask_until(length); + text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text); + sums_vec.xmms[0] = _mm_sad_epu8(text_vec.xmms[0], _mm_setzero_si128()); + sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_vec.xmms[0]); + sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_vec.xmms[0], 1); + return low + high; + } + else if (length <= 32) { + __mmask32 mask = _sz_u32_mask_until(length); + text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text); + sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256()); + // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first. + __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]); + __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1); + __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); + sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm); + sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1); + return low + high; + } + else if (length <= 64) { + __mmask64 mask = _sz_u64_mask_until(length); + text_vec.zmm = _mm512_maskz_loadu_epi8(mask, text); + sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); + return _mm512_reduce_add_epi64(sums_vec.zmm); + } + // For large buffers, fitting into L1 cache sizes, there are other tricks we can use. + // + // 1. Moving in both directions to maximize the throughput, when fetching from multiple + // memory pages. Also helps with cache set-associativity issues, as we won't always + // be fetching the same buckets in the lookup table. + // + // Bidirectional traversal generally adds about 10% to such algorithms. + else if (!is_huge) { + sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less. + sz_size_t tail_length = (sz_size_t)(text + length) % 64; // 63 or less. + sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. + _sz_assert(body_length % 64 == 0 && head_length < 64 && tail_length < 64); + __mmask64 head_mask = _sz_u64_mask_until(head_length); + __mmask64 tail_mask = _sz_u64_mask_until(tail_length); + + text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text); + sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); + for (text += head_length; body_length >= 64; text += 64, body_length -= 64) { + text_vec.zmm = _mm512_load_si512((__m512i const *)text); + sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); + } + text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text); + sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); + return _mm512_reduce_add_epi64(sums_vec.zmm); + } + // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. + // + // 1. Using non-temporal loads to avoid polluting the cache. + // 2. Prefetching the next cache line, to avoid stalling the CPU. This generally useless + // for predictable patterns, so disregard this advice. + // + // Bidirectional traversal generally adds about 10% to such algorithms. 
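The short-input branches above build a `__mmask16` / `__mmask32` / `__mmask64` covering only the first `length` bytes (`_sz_u16_mask_until` and friends) and pair it with `maskz` loads so nothing past the buffer is touched. A scalar model of that "lowest N bits set" mask and of the masked byte sum it enables; the names below are made up, standing in for the library's helpers:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

// Mask with the lowest `n` bits set; `n == 64` must not shift by 64, hence the branch.
std::uint64_t mask_until(std::size_t n) { return n >= 64 ? ~0ull : ((1ull << n) - 1ull); }

// Scalar equivalent of a masked-zero load followed by a byte sum over one 64-byte block:
// bytes whose mask bit is clear are never even read.
std::uint64_t masked_bytesum(unsigned char const *data, std::uint64_t mask) {
    std::uint64_t sum = 0;
    for (std::size_t i = 0; i < 64; ++i)
        if (mask & (1ull << i)) sum += data[i];
    return sum;
}

int main() {
    unsigned char buffer[64];
    for (std::size_t i = 0; i < 64; ++i) buffer[i] = (unsigned char)i;
    assert(mask_until(3) == 0x7);
    assert(masked_bytesum(buffer, mask_until(4)) == 0 + 1 + 2 + 3);
    assert(masked_bytesum(buffer, mask_until(64)) == 63 * 64 / 2);
    return 0;
}
```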
+ else { + sz_u512_vec_t text_reversed_vec, sums_reversed_vec; + sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; + sz_size_t tail_length = (sz_size_t)(text + length) % 64; + sz_size_t body_length = length - head_length - tail_length; + __mmask64 head_mask = _sz_u64_mask_until(head_length); + __mmask64 tail_mask = _sz_u64_mask_until(tail_length); + + text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text); + sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); + text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length); + sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()); + + // Now in the main loop, we can use non-temporal loads, performing the operation in both directions. + for (text += head_length; body_length >= 128; text += 64, text += 64, body_length -= 128) { + text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text)); + sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); + text_reversed_vec.zmm = _mm512_stream_load_si512((__m512i *)(text + body_length - 64)); + sums_reversed_vec.zmm = + _mm512_add_epi64(sums_reversed_vec.zmm, _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512())); + } + if (body_length >= 64) { + text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text)); + sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); + } + + return _mm512_reduce_add_epi64(_mm512_add_epi64(sums_vec.zmm, sums_reversed_vec.zmm)); + } +} + #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_SKYLAKE @@ -341,16 +435,17 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) { /* AVX512 implementation of the string search algorithms for Ice Lake and newer CPUs. * Includes extensions: - * - 2017 Skylake: F, CD, ER, PF, VL, DQ, BW, - * - 2018 CannonLake: IFMA, VBMI, - * - 2019 Ice Lake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES. + * - 2017 Skylake: F, CD, ER, PF, VL, DQ, BW, + * - 2018 CannonLake: IFMA, VBMI, + * - 2019 Ice Lake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES. */ #pragma region Ice Lake Implementation #if SZ_USE_ICE #pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ - apply_to = function) +#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "avx512vnni", "bmi", "bmi2") +#pragma clang attribute push( \ + __attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vnni,bmi,bmi2"))), \ + apply_to = function) SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) { // The naive implementation of this function is very simple. @@ -363,6 +458,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) { sz_u512_vec_t text_vec, sums_vec; // When the buffer is small, there isn't much to innovate. + // Separately handling even smaller payloads doesn't increase performance even on synthetic benchmarks. 
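Both new AVX-512 paths split the input into an unaligned head, a 64-byte-aligned body, and an unaligned tail with the `(64 - address % 64) % 64` arithmetic used above, asserting that the body stays a whole number of aligned blocks. A tiny stand-alone check of those invariants, illustrative only:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
    std::vector<unsigned char> buffer(1024 + 7);
    for (std::size_t offset = 0; offset < 7; ++offset) {
        unsigned char const *text = buffer.data() + offset;
        std::size_t length = 1000;
        std::size_t head_length = (64 - ((std::uintptr_t)text % 64)) % 64; // 63 or less
        std::size_t tail_length = (std::uintptr_t)(text + length) % 64;    // 63 or less
        if (length < head_length + tail_length) continue; // only relevant for tiny buffers
        std::size_t body_length = length - head_length - tail_length;
        assert(head_length < 64 && tail_length < 64);
        assert(body_length % 64 == 0);                             // body is whole aligned blocks
        assert((((std::uintptr_t)text + head_length) % 64) == 0);  // body starts on a 64-byte boundary
        assert(head_length + body_length + tail_length == length);
    }
    return 0;
}
```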
if (length <= 16) { __mmask16 mask = _sz_u16_mask_until(length); text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text); @@ -375,7 +471,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) { __mmask32 mask = _sz_u32_mask_until(length); text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text); sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256()); - // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first. + // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first. __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]); __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1); __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm); @@ -389,30 +485,60 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) { sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); return _mm512_reduce_add_epi64(sums_vec.zmm); } + // For large buffers, fitting into L1 cache sizes, there are other tricks we can use. + // + // 1. Moving in both directions to maximize the throughput, when fetching from multiple + // memory pages. Also helps with cache set-associativity issues, as we won't always + // be fetching the same buckets in the lookup table. + // 2. Port-level parallelism, can be used to hide the latency of expensive SIMD instructions. + // - `VPSADBW (ZMM, ZMM, ZMM)` combination with `VPADDQ (ZMM, ZMM, ZMM)`: + // - On Ice Lake, the `VPSADBW` is 3 cycles on port 5; the `VPADDQ` is 1 cycle on ports 0/5. + // - On Zen 4, the `VPSADBW` is 3 cycles on ports 0/1; the `VPADDQ` is 1 cycle on ports 0/1/2/3. + // - `VPDPBUSDS (ZMM, ZMM, ZMM)`: + // - On Ice Lake, the `VPDPBUSDS` is 5 cycles on port 0. + // - On Zen 4, the `VPDPBUSDS` is 4 cycles on ports 0/1. + // + // Bidirectional traversal generally adds about 10% to such algorithms. else if (!is_huge) { sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less. sz_size_t tail_length = (sz_size_t)(text + length) % 64; // 63 or less. sz_size_t body_length = length - head_length - tail_length; // Multiple of 64. + _sz_assert(body_length % 64 == 0 && head_length < 64 && tail_length < 64); __mmask64 head_mask = _sz_u64_mask_until(head_length); __mmask64 tail_mask = _sz_u64_mask_until(tail_length); + + sz_u512_vec_t zeros_vec, ones_vec; + zeros_vec.zmm = _mm512_setzero_si512(); + ones_vec.zmm = _mm512_set1_epi8(1); + + // Take care of the unaligned head and tail! + sz_u512_vec_t text_reversed_vec, sums_reversed_vec; text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text); - sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()); - for (text += head_length; body_length >= 64; text += 64, body_length -= 64) { - text_vec.zmm = _mm512_load_si512((__m512i const *)text); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); + sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, zeros_vec.zmm); + text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length); + sums_reversed_vec.zmm = _mm512_dpbusds_epi32(zeros_vec.zmm, text_reversed_vec.zmm, ones_vec.zmm); + + // Now in the main loop, we can use aligned loads, performing the operation in both directions. 
+ for (text += head_length; body_length >= 128; text += 64, text += 64, body_length -= 128) { + text_reversed_vec.zmm = _mm512_load_si512((__m512i *)(text + body_length - 64)); + sums_reversed_vec.zmm = _mm512_dpbusds_epi32(sums_reversed_vec.zmm, text_reversed_vec.zmm, ones_vec.zmm); + text_vec.zmm = _mm512_load_si512((__m512i *)(text)); + sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, zeros_vec.zmm)); } - text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text); - sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); - return _mm512_reduce_add_epi64(sums_vec.zmm); + // There may be an aligned chunk of 64 bytes left. + if (body_length >= 64) { + _sz_assert(body_length == 64); + text_vec.zmm = _mm512_load_si512((__m512i *)(text)); + sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, zeros_vec.zmm)); + } + + return _mm512_reduce_add_epi64(sums_vec.zmm) + _mm512_reduce_add_epi32(sums_reversed_vec.zmm); } // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use. // - // 1. Moving in both directions to maximize the throughput, when fetching from multiple - // memory pages. Also helps with cache set-associativity issues, as we won't always - // be fetching the same entries in the lookup table. - // 2. Using non-temporal stores to avoid polluting the cache. - // 3. Prefetching the next cache line, to avoid stalling the CPU. This generally useless - // for predictable patterns, so disregard this advice. + // 1. Using non-temporal loads to avoid polluting the cache. + // 2. Prefetching the next cache line, to avoid stalling the CPU. This generally useless + // for predictable patterns, so disregard this advice. // // Bidirectional traversal generally adds about 10% to such algorithms. else { @@ -428,8 +554,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) { text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length); sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()); - // Now in the main loop, we can use non-temporal loads and stores, - // performing the operation in both directions. + // Now in the main loop, we can use non-temporal loads, performing the operation in both directions. 
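Both the aligned and the streaming loops rely on the same head/body/tail split, computed from the raw pointer value, to guarantee that every 64-byte load in the body is aligned. A quick worked example with made-up numbers, not from the patch itself:

#include <assert.h>
#include <stddef.h>

static void head_body_tail_example(void) {
    size_t address = 4103, length = 300;                      // Any unaligned buffer placement.
    size_t head_length = (64 - address % 64) % 64;            // 4103 % 64 == 7, so 57 bytes to the next boundary.
    size_t tail_length = (address + length) % 64;             // 4403 % 64 == 51 trailing bytes.
    size_t body_length = length - head_length - tail_length;  // 192 bytes, i.e. exactly three aligned ZMM loads.
    assert(head_length == 57 && tail_length == 51 && body_length == 192);
}

The head and tail are then covered by masked unaligned loads, exactly as in the branches above.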
for (text += head_length; body_length >= 128; text += 64, text += 64, body_length -= 128) { text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text)); sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); @@ -506,6 +631,8 @@ SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) { SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { #if SZ_USE_ICE return sz_checksum_ice(text, length); +#elif SZ_USE_SKYLAKE + return sz_checksum_skylake(text, length); #elif SZ_USE_HASWELL return sz_checksum_haswell(text, length); #elif SZ_USE_NEON diff --git a/scripts/bench_token.cpp b/scripts/bench_token.cpp index 35369ac0..684adb05 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -21,14 +21,18 @@ tracked_unary_functions_t checksum_functions() { return std::accumulate(s.begin(), s.end(), (std::size_t)0, [](std::size_t sum, char c) { return sum + static_cast(c); }); }}, - {"sz_checksum_serial", wrap_sz(sz_checksum_serial), true}, + {"sz_checksum_serial", wrap_sz(sz_checksum_serial), false}, #if SZ_USE_HASWELL - {"sz_checksum_haswell", wrap_sz(sz_checksum_haswell), true}, + {"sz_checksum_haswell", wrap_sz(sz_checksum_haswell), false}, +#endif +#if SZ_USE_SKYLAKE + {"sz_checksum_skylake", wrap_sz(sz_checksum_skylake), false}, #endif #if SZ_USE_ICE + {"sz_checksum_ice", wrap_sz(sz_checksum_ice), false}, #endif #if SZ_USE_NEON - {"sz_checksum_neon", wrap_sz(sz_checksum_neon), true}, + {"sz_checksum_neon", wrap_sz(sz_checksum_neon), false}, #endif }; return result; From 509b58b754431bad3be050aaa90fd34641596994 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 13 Feb 2025 15:03:51 +0000 Subject: [PATCH 079/751] Fix: Loop in `sz_checksum_haswell` --- include/stringzilla/hash.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index bdffd583..a69d85bf 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -284,16 +284,18 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) { else { sz_u256_vec_t text_reversed_vec, sums_reversed_vec; sums_reversed_vec.ymm = _mm256_setzero_si256(); - for (; body_length >= 64; text += 64, body_length -= 64) { + for (; body_length >= 64; text += 32, body_length -= 64) { text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); - text_reversed_vec.ymm = _mm256_stream_load_si256((__m256i *)(text + body_length - 64)); + text_reversed_vec.ymm = _mm256_stream_load_si256((__m256i *)(text + body_length - 32)); sums_reversed_vec.ymm = _mm256_add_epi64( sums_reversed_vec.ymm, _mm256_sad_epu8(text_reversed_vec.ymm, _mm256_setzero_si256())); } if (body_length >= 32) { + _sz_assert(body_length == 32); text_vec.ymm = _mm256_stream_load_si256((__m256i *)(text)); sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256())); + text += 32; } sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, sums_reversed_vec.ymm); } From 4044855653b9571eff857ea303e3286086d502ee Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 13 Feb 2025 15:04:12 +0000 Subject: [PATCH 080/751] Fix: Loops in AVX-512 checksums --- include/stringzilla/hash.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/stringzilla/hash.h 
b/include/stringzilla/hash.h index a69d85bf..00200f29 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -414,7 +414,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_skylake(sz_cptr_t text, sz_size_t length) { sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()); // Now in the main loop, we can use non-temporal loads, performing the operation in both directions. - for (text += head_length; body_length >= 128; text += 64, text += 64, body_length -= 128) { + for (text += head_length; body_length >= 128; text += 64, body_length -= 128) { text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text)); sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); text_reversed_vec.zmm = _mm512_stream_load_si512((__m512i *)(text + body_length - 64)); @@ -501,6 +501,8 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) { // - On Zen 4, the `VPDPBUSDS` is 4 cycles on ports 0/1. // // Bidirectional traversal generally adds about 10% to such algorithms. + // Port level parallelism can yield more, but remember that one of the instructions accumulates + // with 32-bit integers and the other one will be using 64-bit integers. else if (!is_huge) { sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less. sz_size_t tail_length = (sz_size_t)(text + length) % 64; // 63 or less. @@ -521,7 +523,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) { sums_reversed_vec.zmm = _mm512_dpbusds_epi32(zeros_vec.zmm, text_reversed_vec.zmm, ones_vec.zmm); // Now in the main loop, we can use aligned loads, performing the operation in both directions. - for (text += head_length; body_length >= 128; text += 64, text += 64, body_length -= 128) { + for (text += head_length; body_length >= 128; text += 64, body_length -= 128) { text_reversed_vec.zmm = _mm512_load_si512((__m512i *)(text + body_length - 64)); sums_reversed_vec.zmm = _mm512_dpbusds_epi32(sums_reversed_vec.zmm, text_reversed_vec.zmm, ones_vec.zmm); text_vec.zmm = _mm512_load_si512((__m512i *)(text)); @@ -557,7 +559,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) { sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()); // Now in the main loop, we can use non-temporal loads, performing the operation in both directions. 
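The loop-header fixes in this commit may look asymmetric at first glance: each iteration consumes 128 bytes, yet the pointer only advances by 64. The second block of every iteration is taken from the far end of the still-unprocessed body, so the span shrinks from both sides at once. A scalar model of the same invariant, not from the patch itself (the helper name is made up; assumes `body_length` is a multiple of 64, as in the surrounding code):

#include <stddef.h>
#include <stdint.h>

static uint64_t bidirectional_body_sum(unsigned char const *body, size_t body_length) {
    uint64_t forward = 0, backward = 0;                                           // Two independent dependency chains.
    while (body_length >= 128) {
        for (size_t i = 0; i != 64; ++i) forward += body[i];                      // Front block.
        for (size_t i = 0; i != 64; ++i) backward += body[body_length - 64 + i];  // Back block.
        body += 64;         // The front pointer advances by one block only...
        body_length -= 128; // ...while the unprocessed span shrinks by two blocks.
    }
    if (body_length >= 64)  // At most one 64-byte block can remain in the middle.
        for (size_t i = 0; i != 64; ++i) forward += body[i];
    return forward + backward;
}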
- for (text += head_length; body_length >= 128; text += 64, text += 64, body_length -= 128) { + for (text += head_length; body_length >= 128; text += 64, body_length -= 128) { text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text)); sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512())); text_reversed_vec.zmm = _mm512_stream_load_si512((__m512i *)(text + body_length - 64)); From 84cb4c8ab6bcd22bef935bcb350584dfdd4b6de7 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 14 Feb 2025 15:45:35 +0000 Subject: [PATCH 081/751] Fix: Tail handling in `sz_checksum_haswell` --- include/stringzilla/hash.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index 00200f29..e0c0447f 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -270,6 +270,8 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) { // Handle the head while (head_length--) result += *text++; + // Handle the tail + while (tail_length) result += text[length - (tail_length--) - 1]; sz_u256_vec_t text_vec, sums_vec; sums_vec.ymm = _mm256_setzero_si256(); @@ -300,9 +302,6 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) { sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, sums_reversed_vec.ymm); } - // Handle the tail - while (tail_length--) result += *text++; - // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first. __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm); __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1); From 5bbd9715a3521c150e5ec62b6334224e192a7d2a Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 14 Feb 2025 23:47:33 +0000 Subject: [PATCH 082/751] Fix: Infer allocators `value_type` --- include/stringzilla/stringzilla.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index 126211c4..d64b0c03 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -1058,7 +1058,8 @@ static void *_call_allocate(sz_size_t n, void *allocator_state) noexcept { template static void _call_free(void *ptr, sz_size_t n, void *allocator_state) noexcept { - return reinterpret_cast(allocator_state)->deallocate(reinterpret_cast(ptr), n); + using value_type_ = typename allocator_type_::value_type; + return reinterpret_cast(allocator_state)->deallocate(reinterpret_cast(ptr), n); } template From ec81663483539fefbd3fe496bda62733f4408b99 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 00:02:35 +0000 Subject: [PATCH 083/751] Break: `sz_sort` now takes allocators --- README.md | 2 +- c/lib.c | 5 +- include/stringzilla/sort.h | 521 +++++++++++++--------------- include/stringzilla/stringzilla.hpp | 5 +- include/stringzilla/types.h | 9 +- 5 files changed, 256 insertions(+), 286 deletions(-) diff --git a/README.md b/README.md index fb2a0384..52f80d41 100644 --- a/README.md +++ b/README.md @@ -632,7 +632,7 @@ sz_size_t substring_position = sz_find_neon(haystack.start, haystack.length, nee sz_u64_t hash = sz_hash(haystack.start, haystack.length); // Perform collection level operations -sz_sequence_t array = {your_order, your_count, your_get_start, your_get_length, your_handle}; +sz_sequence_t array = {your_handle, your_count, 
your_get_start, your_get_length}; sz_sort(&array, &your_config); ``` diff --git a/c/lib.c b/c/lib.c index 3a447d99..5a4183cd 100644 --- a/c/lib.c +++ b/c/lib.c @@ -188,7 +188,8 @@ typedef struct sz_implementations_t { sz_edit_distance_t edit_distance; sz_alignment_score_t alignment_score; - sz_hashes_t hashes; + + sz_sort_t sort; } sz_implementations_t; @@ -224,7 +225,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->edit_distance = sz_edit_distance_serial; impl->alignment_score = sz_alignment_score_serial; - impl->hashes = 0; + impl->sort = sz_sort_serial; #if SZ_USE_HASWELL if (caps & sz_cap_haswell_k) { diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index 7a8de124..e517159d 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -1,14 +1,12 @@ /** - * @brief Hardware-accelerated string sorting. + * @brief Hardware-accelerated string collection sorting and intersections. * @file sort.h * @author Ash Vardanian * * Includes core APIs: * - * - `sz_partition` - to split the sequence into two parts based on a predicate. - * - `sz_merge` - to merge two consecutive sorted chunks forming the same continuous `sequence`. - * - `sz_sort` - to sort an arbitrary string sequence. - * - `sz_sort_partial` - to partially sort an arbitrary string sequence. + * - `sz_sort` - to sort an arbitrary string collection. + * - TODO: `sz_stable_sort` - to sort a string collection while preserving the relative order of equal elements. */ #ifndef STRINGZILLA_SORT_H_ #define STRINGZILLA_SORT_H_ @@ -24,320 +22,293 @@ extern "C" { #pragma region Core API /** - * @brief Similar to `std::partition`, given a predicate splits the sequence into two parts. - * The algorithm is unstable, meaning that elements may change relative order, as long - * as they are in the right partition. This is the simpler algorithm for partitioning. - */ -SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate); - -/** - * @brief Inplace `std::set_union` for two consecutive chunks forming the same continuous `sequence`. + * @brief Faster `std::sort` for an arbitrary string sequence. * - * @param partition The number of elements in the first sub-sequence in `sequence`. - * @param less Comparison function, to determine the lexicographic ordering. + * @param collection The collection of strings to sort. + * @param alloc Memory allocator for temporary storage. + * @param order The output - indices of the sorted collection elements. + * @return Whether the operation was successful. */ -SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less); +SZ_PUBLIC sz_bool_t sz_sort(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); -/** - * @brief Sorting algorithm, combining Radix Sort for the first 32 bits of every word - * and a follow-up by a more conventional sorting procedure on equally prefixed parts. - */ -SZ_PUBLIC void sz_sort(sz_sequence_t *sequence); +/** @copydoc sz_sort */ +SZ_PUBLIC sz_bool_t sz_sort_serial(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); -/** - * @brief Partial sorting algorithm, combining Radix Sort for the first 32 bits of every word - * and a follow-up by a more conventional sorting procedure on equally prefixed parts. 
- */ -SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t n); +/** @copydoc sz_sort */ +SZ_PUBLIC sz_bool_t sz_sort_skylake(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); -/** - * @brief Intro-Sort algorithm that supports custom comparators. - */ -SZ_PUBLIC void sz_sort_intro(sz_sequence_t *sequence, sz_sequence_comparator_t less); +/** @copydoc sz_sort */ +SZ_PUBLIC sz_bool_t sz_sort_sve(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); #pragma endregion #pragma region Serial Implementation -SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) { - - sz_size_t matches = 0; - while (matches != sequence->count && predicate(sequence, sequence->order[matches])) ++matches; - - for (sz_size_t i = matches + 1; i < sequence->count; ++i) - if (predicate(sequence, sequence->order[i])) - sz_u64_swap(sequence->order + i, sequence->order + matches), ++matches; - - return matches; +typedef sz_size_t _sz_sorting_window_t; + +SZ_PUBLIC void _sz_sort_serial_export_prefixes( // + sz_sequence_t const *const collection, // + _sz_sorting_window_t *const global_windows, // + sz_size_t const start_in_collection, sz_size_t const end_in_collection, // + sz_size_t const start_character) { + + // Depending on the architecture, we will export a different number of bytes. + // On 32-bit architectures, we will export 3 bytes, and on 64-bit architectures - 7 bytes. + sz_size_t const window_capacity = sizeof(_sz_sorting_window_t) - 1; + + // Perform the same operation for every string. + for (sz_size_t i = start_in_collection; i < end_in_collection; ++i) { + // Get the string slice in global memory. + sz_cptr_t const source_str = collection->get_start(collection, i); + sz_size_t const length = collection->get_length(collection, i); + sz_size_t const remaining_length = length > start_character ? length - start_character : 0; + sz_size_t const exported_length = remaining_length > window_capacity ? window_capacity : remaining_length; + // Fill with zeros, export a slice, and mark the exported length. + sz_size_t *target_integer = &global_windows[i]; + sz_ptr_t target_str = (sz_ptr_t)target_integer; + *target_integer = 0; + for (sz_size_t j = 0; j < exported_length; ++j) target_str[j] = source_str[j + start_character]; + target_str[window_capacity] = exported_length; +#if defined(_SZ_IS_64_BIT) + *target_integer = sz_u64_bytes_reverse(*target_integer); +#else + *target_integer = sz_u32_bytes_reverse(*target_integer); +#endif + } } -SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) { - - sz_size_t start_b = partition + 1; - - // If the direct merge is already sorted - if (!less(sequence, sequence->order[start_b], sequence->order[partition])) return; - - sz_size_t start_a = 0; - while (start_a <= partition && start_b <= sequence->count) { - - // If element 1 is in right place - if (!less(sequence, sequence->order[start_b], sequence->order[start_a])) { start_a++; } +/** + * @brief Helper function of the serial QuickSort algorithm, that rearranges the elements in + * such a way, that all entries around the pivot are less than the pivot. + * + * It means that no relative order among the elements on the left or right side of the pivot is preserved. + * We chose the pivot point using Robert Sedgewick's method - the median of three elements - the first, + * the middle, and the last element of the given range. 
+ */ +SZ_PUBLIC sz_size_t _sz_sort_serial_partition( // + _sz_sorting_window_t *const global_windows, sz_sorted_idx_t *const global_order, // + sz_size_t const start_in_collection, sz_size_t const end_in_collection) { + + // Chose the pivot offset. + sz_size_t pivot_offset; + _sz_sorting_window_t pivot_window; + { + sz_size_t const middle_offset = start_in_collection + (end_in_collection - start_in_collection) / 2; + sz_size_t const last_offset = end_in_collection - 1; + sz_size_t const first_offset = start_in_collection; + _sz_sorting_window_t const first_window = global_windows[first_offset]; + _sz_sorting_window_t const middle_window = global_windows[middle_offset]; + _sz_sorting_window_t const last_window = global_windows[last_offset]; + if (first_window < middle_window) { + if (middle_window < last_window) { pivot_offset = middle_offset, pivot_window = middle_window; } + else if (first_window < last_window) { pivot_offset = last_offset, pivot_window = last_window; } + else { pivot_offset = first_offset, pivot_window = first_window; } + } else { - sz_size_t value = sequence->order[start_b]; - sz_size_t index = start_b; - - // Shift all the elements between element 1 - // element 2, right by 1. - while (index != start_a) { sequence->order[index] = sequence->order[index - 1], index--; } - sequence->order[start_a] = value; - - // Update all the pointers - start_a++; - partition++; - start_b++; + if (first_window < last_window) { pivot_offset = first_offset, pivot_window = first_window; } + else if (middle_window < last_window) { pivot_offset = last_offset, pivot_window = last_window; } + else { pivot_offset = middle_offset, pivot_window = middle_window; } } } -} -SZ_PUBLIC void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - sz_u64_t *keys = sequence->order; - sz_size_t keys_count = sequence->count; - for (sz_size_t i = 1; i < keys_count; i++) { - sz_u64_t i_key = keys[i]; - sz_size_t j = i; - for (; j > 0 && less(sequence, i_key, keys[j - 1]); --j) keys[j] = keys[j - 1]; - keys[j] = i_key; + // Loop through the collection and move the elements around the pivot. + sz_size_t left_offset = start_in_collection; + sz_size_t right_offset = end_in_collection - 1; + while (left_offset <= right_offset) { + // Find the first element on the left that is greater than the pivot. + while (global_windows[left_offset] < pivot_window) ++left_offset; + // Find the first element on the right that is less than the pivot. + while (global_windows[right_offset] > pivot_window) --right_offset; + // Swap the elements if they are in the wrong order. 
+ if (left_offset <= right_offset) { +#if defined(_SZ_IS_64_BIT) + sz_u64_swap(&global_order[left_offset], &global_order[right_offset]); + sz_u64_swap(&global_windows[left_offset], &global_windows[right_offset]); +#else + sz_u32_swap(&global_order[left_offset], &global_order[right_offset]); + sz_u32_swap(&global_windows[left_offset], &global_windows[right_offset]); +#endif + ++left_offset; + --right_offset; + } } -} -SZ_INTERNAL void _sz_sift_down( // - sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start, sz_size_t end) { - sz_size_t root = start; - while (2 * root + 1 <= end) { - sz_size_t child = 2 * root + 1; - if (child + 1 <= end && less(sequence, order[child], order[child + 1])) { child++; } - if (!less(sequence, order[root], order[child])) { return; } - sz_u64_swap(order + root, order + child); - root = child; - } + return pivot_offset; } -SZ_INTERNAL void _sz_heapify(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t count) { - sz_size_t start = (count - 2) / 2; - while (1) { - _sz_sift_down(sequence, less, order, start, count - 1); - if (start == 0) return; - start--; +SZ_PUBLIC void _sz_sort_serial_recursively( // + sz_sequence_t const *const collection, // + _sz_sorting_window_t *const global_windows, sz_size_t *const global_order, // + sz_size_t const start_in_collection, sz_size_t const end_in_collection, // + sz_size_t const start_character) { + // Partition the collection around some pivot + sz_size_t pivot_index = + _sz_sort_serial_partition(global_windows, global_order, start_in_collection, end_in_collection); + + // Recursively sort the left partition + if (start_in_collection < pivot_index) { + _sz_sort_serial_recursively(collection, global_windows, global_order, start_in_collection, pivot_index, + start_character); } -} -SZ_INTERNAL void _sz_heapsort(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last) { - sz_u64_t *order = sequence->order; - sz_size_t count = last - first; - _sz_heapify(sequence, less, order + first, count); - sz_size_t end = count - 1; - while (end > 0) { - sz_u64_swap(order + first, order + first + end); - end--; - _sz_sift_down(sequence, less, order + first, 0, end); + // Recursively sort the right partition + if (pivot_index + 1 < end_in_collection) { + _sz_sort_serial_recursively(collection, global_windows, global_order, pivot_index + 1, end_in_collection, + start_character); } } -SZ_PUBLIC void sz_sort_introsort_recursion( // - sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last, sz_size_t depth) { - - sz_size_t length = last - first; - switch (length) { - case 0: - case 1: return; - case 2: - if (less(sequence, sequence->order[first + 1], sequence->order[first])) - sz_u64_swap(&sequence->order[first], &sequence->order[first + 1]); - return; - case 3: { - sz_u64_t a = sequence->order[first]; - sz_u64_t b = sequence->order[first + 1]; - sz_u64_t c = sequence->order[first + 2]; - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - if (less(sequence, c, b)) sz_u64_swap(&c, &b); - if (less(sequence, b, a)) sz_u64_swap(&a, &b); - sequence->order[first] = a; - sequence->order[first + 1] = b; - sequence->order[first + 2] = c; - return; - } - } - // Until a certain length, the quadratic-complexity insertion-sort is fine - if (length <= 16) { - sz_sequence_t sub_seq = *sequence; - sub_seq.order += first; - sub_seq.count = length; - sz_sort_insertion(&sub_seq, less); - return; +SZ_PUBLIC void _sz_sort_serial_next_window( 
// + sz_sequence_t const *const collection, // + _sz_sorting_window_t *const global_windows, sz_size_t *const global_order, // + sz_size_t const start_in_collection, sz_size_t const end_in_collection, // + sz_size_t const start_character) { + + // Prepare the new range of windows + _sz_sort_serial_export_prefixes(collection, global_windows, start_in_collection, end_in_collection, + start_character); + + // Sort current windows with a quicksort + _sz_sort_serial_recursively(collection, global_windows, global_order, start_in_collection, end_in_collection, + start_character); + + // Depending on the architecture, we will export a different number of bytes. + // On 32-bit architectures, we will export 3 bytes, and on 64-bit architectures - 7 bytes. + sz_size_t const window_capacity = sizeof(_sz_sorting_window_t) - 1; + + // Repeat the procedure for the identical windows + sz_size_t nested_start = start_in_collection; + sz_size_t nested_end = start_in_collection; + while (nested_end != end_in_collection) { + // Find the end of the identical windows + _sz_sorting_window_t current_window_integer = global_windows[nested_start]; + while (nested_end != end_in_collection && current_window_integer == global_windows[nested_end]) ++nested_end; + + // If the identical windows are not trivial and each string has more characters, sort them recursively + sz_cptr_t current_window_str = (sz_cptr_t)&current_window_integer; + int current_window_length = current_window_str[window_capacity]; + if (nested_end - nested_start > 1 && current_window_length == window_capacity) { + _sz_sort_serial_next_window(collection, global_windows, global_order, nested_start, nested_end, + start_character + window_capacity); + } + // Move to the next + nested_start = nested_end; + } +} + +SZ_PUBLIC void _sz_sort_serial_insertion(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { + // This algorithm needs no memory allocations: + sz_unused(alloc); + + // Assume `order` is already initialized with 0, 1, 2, ... N. + for (sz_size_t i = 1; i < collection->count; ++i) { + sz_sorted_idx_t current_idx = order[i]; + sz_size_t j = i; + while (j > 0) { + // Get the two strings to compare.
+ sz_sorted_idx_t previous_idx = order[j - 1]; + sz_cptr_t previous_start = collection->get_start(collection, previous_idx); + sz_cptr_t current_start = collection->get_start(collection, current_idx); + sz_size_t previous_length = collection->get_length(collection, previous_idx); + sz_size_t current_length = collection->get_length(collection, current_idx); + + // Use the provided sz_order to compare. + sz_ordering_t ordering = sz_order(previous_start, previous_length, current_start, current_length); + + // If the previous string is not greater than current_idx, we're done. + if (ordering != sz_greater_k) break; + + // Otherwise, shift the previous element to the right. + order[j] = order[j - 1]; + --j; + } + order[j] = current_idx; } - - // Recursively sort the partitions - sz_sort_introsort_recursion(sequence, less, first, left, depth); - sz_sort_introsort_recursion(sequence, less, right + 1, last, depth); -} - -SZ_PUBLIC void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - if (sequence->count == 0) return; - sz_size_t size_is_not_power_of_two = (sequence->count & (sequence->count - 1)) != 0; - sz_size_t depth_limit = sz_size_log2i_nonzero(sequence->count) + size_is_not_power_of_two; - sz_sort_introsort_recursion(sequence, less, 0, sequence->count, depth_limit); } -SZ_PUBLIC void sz_sort_recursion( // - sz_sequence_t *sequence, sz_size_t bit_idx, sz_size_t bit_max, sz_sequence_comparator_t comparator, - sz_size_t partial_order_length) { +SZ_PUBLIC sz_bool_t sz_sort_serial(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { - if (!sequence->count) return; + // First, initialize the `order` with `std::iota`-like behavior. + for (sz_size_t i = 0; i != collection->count; ++i) order[i] = i; - // Array of size one doesn't need sorting - only needs the prefix to be discarded. - if (sequence->count == 1) { - sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; - order_half_words[1] = 0; - return; + // On very small collections - just use the quadratic-complexity insertion sort + // without any smart optimizations or memory allocations. + if (collection->count <= 32) { + _sz_sort_serial_insertion(collection, alloc, order); + return sz_true_k; } - // Partition a range of integers according to a specific bit value - sz_size_t split = 0; - sz_u64_t mask = (1ull << 63) >> bit_idx; - - // The clean approach would be to perform a single pass over the sequence. + // One of the reasons for slow string operations is the significant overhead of branching when performing + // individual string comparisons. // - // while (split != sequence->count && !(sequence->order[split] & mask)) ++split; - // for (sz_size_t i = split + 1; i < sequence->count; ++i) - // if (!(sequence->order[i] & mask)) sz_u64_swap(sequence->order + i, sequence->order + split), ++split; + // The core idea of our algorithm is to minimize character-level loops in string comparisons and + // instead operate on larger integer words - 4 or 8 bytes at once, on 32-bit or 64-bit architectures, respectively. + // Let's say we have N strings and the pointer size is P. // - // This, however, doesn't take into account the high relative cost of writes and swaps. - // To circumvent that, we can first count the total number entries to be mapped into either part. - // And then walk through both parts, swapping the entries that are in the wrong part. - // This would often lead to ~15% performance gain. 
- sz_size_t count_with_bit_set = 0; - for (sz_size_t i = 0; i != sequence->count; ++i) count_with_bit_set += (sequence->order[i] & mask) != 0; - split = sequence->count - count_with_bit_set; - - // It's possible that the sequence is already partitioned. - if (split != 0 && split != sequence->count) { - // Use two pointers to efficiently reposition elements. - // On pointer walks left-to-right from the start, and the other walks right-to-left from the end. - sz_size_t left = 0; - sz_size_t right = sequence->count - 1; - while (1) { - // Find the next element with the bit set on the left side. - while (left < split && !(sequence->order[left] & mask)) ++left; - // Find the next element without the bit set on the right side. - while (right >= split && (sequence->order[right] & mask)) --right; - // Swap the mispositioned elements. - if (left < split && right >= split) { - sz_u64_swap(sequence->order + left, sequence->order + right); - ++left; - --right; - } - else { break; } - } - } - - // Go down recursively. - if (bit_idx < bit_max) { - sz_sequence_t a = *sequence; - a.count = split; - sz_sort_recursion(&a, bit_idx + 1, bit_max, comparator, partial_order_length); - - sz_sequence_t b = *sequence; - b.order += split; - b.count -= split; - sz_sort_recursion(&b, bit_idx + 1, bit_max, comparator, partial_order_length); - } - // Reached the end of recursion. - else { - // Discard the prefixes. - sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; - for (sz_size_t i = 0; i != sequence->count; ++i) { order_half_words[i * 2 + 1] = 0; } - - sz_sequence_t a = *sequence; - a.count = split; - sz_sort_introsort(&a, comparator); - - sz_sequence_t b = *sequence; - b.order += split; - b.count -= split; - sz_sort_introsort(&b, comparator); - } + // Our recursive algorithm will take the first P bytes of each string and sort them as integers. + // Assuming that some strings may contain or even end with NULL bytes, we need to make sure, that their length + // is included in those P-long words. So, in reality, we will be taking (P-1) bytes from each string on every + // iteration of a recursive algorithm. + _sz_sorting_window_t *windows = + (_sz_sorting_window_t *)alloc->allocate(collection->count * sizeof(_sz_sorting_window_t), alloc); + if (!windows) return sz_false_k; + + // Recursively sort the whole collection. + _sz_sort_serial_recursively(collection, windows, order, 0, collection->count, 0); + + // Free temporary storage. + alloc->free(windows, collection->count * sizeof(_sz_sorting_window_t), alloc); + return sz_true_k; } -SZ_INTERNAL sz_bool_t _sz_sort_is_less(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { - sz_cptr_t i_str = sequence->get_start(sequence, i_key); - sz_cptr_t j_str = sequence->get_start(sequence, j_key); - sz_size_t i_len = sequence->get_length(sequence, i_key); - sz_size_t j_len = sequence->get_length(sequence, j_key); - return (sz_bool_t)(sz_order_serial(i_str, i_len, j_str, j_len) == sz_less_k); -} - -SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t partial_order_length) { - -#if _SZ_IS_BIG_ENDIAN - // TODO: Implement partial sort for big-endian systems. For now this sorts the whole thing. 
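To illustrate the "sorting window" used above: `_sz_sort_serial_export_prefixes` packs up to 7 leading bytes of every string plus the count of packed bytes into one machine word, ordered so that a plain integer comparison ranks the windows exactly like `memcmp` would rank the prefixes, with shorter strings sorting before longer ones that share the same prefix. A standalone sketch of the same packing, not from the patch itself (the helper name is made up; the shifts build the byte-reversed value directly, so the sketch is endianness-agnostic):

#include <stddef.h>
#include <stdint.h>

static uint64_t pack_window(char const *str, size_t length) {
    size_t const capacity = sizeof(uint64_t) - 1;  // 7 payload bytes + 1 byte for the packed length.
    size_t const taken = length < capacity ? length : capacity;
    uint64_t word = 0;
    for (size_t i = 0; i != taken; ++i)            // The first character lands in the most significant byte.
        word |= (uint64_t)(unsigned char)str[i] << (8 * (capacity - i));
    return word | (uint64_t)taken;                 // The packed-byte count becomes the least significant byte.
}

/* pack_window("app", 3) < pack_window("apple", 5) < pack_window("apply", 5),
   matching the lexicographic order of the underlying strings. */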
- sz_unused(partial_order_length); - sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); -#else +#pragma endregion // Serial Implementation - // Export up to 4 bytes into the `sequence` bits themselves - for (sz_size_t i = 0; i != sequence->count; ++i) { - sz_cptr_t begin = sequence->get_start(sequence, sequence->order[i]); - sz_size_t length = sequence->get_length(sequence, sequence->order[i]); - length = length > 4u ? 4u : length; - sz_ptr_t prefix = (sz_ptr_t)&sequence->order[i]; - for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; +#pragma region Ice Lake Implementation + +SZ_PUBLIC void _sz_sort_ice_recursively( // + sz_sequence_t const *const collection, // + _sz_sorting_window_t *const global_windows, sz_size_t *const global_order, // + sz_size_t const start_in_collection, sz_size_t const end_in_collection, // + sz_size_t const start_character) { + + // Prepare the new range of windows + _sz_sort_serial_export_prefixes(collection, global_windows, start_in_collection, end_in_collection, + start_character); + + // We can implement a form of a Radix sort here, that will count the number of elements with + // a certain bit set. The naive approach may require too many loops over data. A more "vectorized" + // approach would be to maintain a histogram for several bits at once. For 4 bits we will + // need 2^4 = 16 counters. + sz_size_t histogram[16] = {0}; + for (sz_size_t byte_in_window = 0; byte_in_window != sizeof(_sz_sorting_window_t); ++byte_in_window) { + // First sort based on the low nibble of each byte. + for (sz_size_t i = start_in_collection; i < end_in_collection; ++i) { + sz_size_t const byte = (global_windows[i] >> (byte_in_window * 8)) & 0xFF; + ++histogram[byte]; + } + sz_size_t offset = start_in_collection; + for (sz_size_t i = 0; i != 16; ++i) { + sz_size_t const count = histogram[i]; + histogram[i] = offset; + offset += count; + } + for (sz_size_t i = start_in_collection; i < end_in_collection; ++i) { + sz_size_t const byte = (global_windows[i] >> (byte_in_window * 8)) & 0xFF; + global_order[histogram[byte]] = i; + ++histogram[byte]; + } } - - // Perform optionally-parallel radix sort on them - sz_sort_recursion(sequence, 0, 32, (sz_sequence_comparator_t)_sz_sort_is_less, partial_order_length); -#endif } -SZ_PUBLIC void sz_sort(sz_sequence_t *sequence) { -#if _SZ_IS_BIG_ENDIAN - sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less); -#else - sz_sort_partial(sequence, sequence->count); -#endif -} +#pragma endregion // Ice Lake Implementation -#pragma endregion // Serial Implementation +SZ_PUBLIC sz_bool_t sz_sort(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order) { + return sz_sort_serial(collection, alloc, order); +} #ifdef __cplusplus } diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index d64b0c03..89fbd39b 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -4024,12 +4024,13 @@ void sorted_order(objects_type_ const *begin, objects_type_ const *end, sorted_i for (std::size_t i = 0; i != args.count; ++i) order[i] = static_cast(i); sz_sequence_t array; - array.order = reinterpret_cast(order); array.count = args.count; array.handle = &args; array.get_start = _call_sequence_member_start; array.get_length = _call_sequence_member_length; - sz_sort(&array); + + using sz_alloc_type = sz_memory_allocator_t; + _with_alloc>([&](sz_alloc_type &alloc) { return sz_sort(&array, &alloc, order); }); } #if 
!SZ_AVOID_STL diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index a170b6b0..9fb67112 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -320,7 +320,8 @@ typedef char *sz_ptr_t; // A type alias for `char *` typedef char const *sz_cptr_t; // A type alias for `char const *` typedef sz_i8_t sz_error_cost_t; // Character mismatch cost for fuzzy matching functions -typedef sz_u64_t sz_sorted_idx_t; // Index of a sorted string in a list of strings +struct sz_sequence_t; // Forward declaration of an ordered collection of strings +typedef sz_size_t sz_sorted_idx_t; // Index of a sorted string in a list of strings typedef enum { sz_false_k = 0, sz_true_k = 1 } sz_bool_t; // Only one relevant bit typedef enum { sz_less_k = -1, sz_equal_k = 0, sz_greater_k = 1 } sz_ordering_t; // Only three possible states: <=> @@ -626,20 +627,16 @@ SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_le #pragma region String Sequences API -struct sz_sequence_t; - typedef sz_cptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t); typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t); typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t); typedef sz_bool_t (*sz_string_is_less_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); typedef struct sz_sequence_t { - sz_sorted_idx_t *order; + void const *handle; sz_size_t count; sz_sequence_member_start_t get_start; sz_sequence_member_length_t get_length; - void const *handle; } sz_sequence_t; /** From b20d7cdcd70dfc1c702cd18525b3f454d6efbaf6 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 00:20:27 +0000 Subject: [PATCH 084/751] Fix: Tail sum order in `checksum_haswell` --- include/stringzilla/hash.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index e0c0447f..415b4b67 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -38,8 +38,6 @@ SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length); * @param text String to hash. * @param length Number of bytes in the text. * @return 64-bit hash value. - * - * @see sz_hashes, sz_hashes_fingerprint, sz_hashes_intersection */ SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length) { sz_unused(text && length); @@ -268,10 +266,10 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) { sz_size_t body_length = length - head_length - tail_length; // Multiple of 32. 
sz_u64_t result = 0; + // Handle the tail before we start updating the `text` pointer + while (tail_length) result += text[length - (tail_length--)]; // Handle the head while (head_length--) result += *text++; - // Handle the tail - while (tail_length) result += text[length - (tail_length--) - 1]; sz_u256_vec_t text_vec, sums_vec; sums_vec.ymm = _mm256_setzero_si256(); From abe8d07cc84d62038e1001ec67204102a1b955b0 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 00:21:17 +0000 Subject: [PATCH 085/751] Improve: Validate checksums in benchmark --- scripts/bench_token.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/bench_token.cpp b/scripts/bench_token.cpp index 684adb05..2e694588 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -21,18 +21,18 @@ tracked_unary_functions_t checksum_functions() { return std::accumulate(s.begin(), s.end(), (std::size_t)0, [](std::size_t sum, char c) { return sum + static_cast(c); }); }}, - {"sz_checksum_serial", wrap_sz(sz_checksum_serial), false}, + {"sz_checksum_serial", wrap_sz(sz_checksum_serial), true}, #if SZ_USE_HASWELL - {"sz_checksum_haswell", wrap_sz(sz_checksum_haswell), false}, + {"sz_checksum_haswell", wrap_sz(sz_checksum_haswell), true}, #endif #if SZ_USE_SKYLAKE - {"sz_checksum_skylake", wrap_sz(sz_checksum_skylake), false}, + {"sz_checksum_skylake", wrap_sz(sz_checksum_skylake), true}, #endif #if SZ_USE_ICE - {"sz_checksum_ice", wrap_sz(sz_checksum_ice), false}, + {"sz_checksum_ice", wrap_sz(sz_checksum_ice), true}, #endif #if SZ_USE_NEON - {"sz_checksum_neon", wrap_sz(sz_checksum_neon), false}, + {"sz_checksum_neon", wrap_sz(sz_checksum_neon), true}, #endif }; return result; @@ -242,6 +242,7 @@ void bench_on_synthetic_data() { int main(int argc, char const **argv) { std::printf("StringZilla. Starting token-level benchmarks.\n"); + std::printf("- Seconds per benchmark: %zu\n", seconds_per_benchmark); if (argc < 2) { bench_on_synthetic_data(); } else { bench_on_input_data(argc, argv); } From bce107af19ec7894904ab763064986336771b3e4 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 00:21:55 +0000 Subject: [PATCH 086/751] Improve: Wrap `std::accumulate` for checksums --- scripts/test.cpp | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/scripts/test.cpp b/scripts/test.cpp index 2bf886d1..0cf11552 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -797,6 +797,23 @@ static void test_non_stl_extensions_for_reads() { assert((str("hello")[{100, -100}] == "")); assert((str("hello")[{-100, -100}] == "")); + // Checksums + auto accumulate_bytes = [](str const &s) -> std::size_t { + return std::accumulate(s.begin(), s.end(), (std::size_t)0, + [](std::size_t sum, char c) { return sum + static_cast(c); }); + }; + assert(str("a").checksum() == (std::size_t)'a'); + assert(str("0").checksum() == (std::size_t)'0'); + assert(str("0123456789").checksum() == arithmetic_sum('0', '9')); + assert(str("abcdefghijklmnopqrstuvwxyz").checksum() == arithmetic_sum('a', 'z')); + assert(str("abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz").checksum() == + arithmetic_sum('a', 'z') * 3); + assert_scoped( + str s = + "近来,加文出席微博之夜时对着镜头频繁摆出假笑表情、一度累瘫睡倒在沙发上的照片被广泛转发,引发对他失去童年、" + "被过度消费的担忧。八岁的加文,已当网红近六年了,可以说,自懂事以来,他没有过过一天没有名气的日子。", + (void)0, s.checksum() == accumulate_bytes(s)); + // Computing edit-distances. 
assert(sz::hamming_distance(str("hello"), str("hello")) == 0); assert(sz::hamming_distance(str("hello"), str("hell")) == 1); @@ -837,14 +854,6 @@ static void test_non_stl_extensions_for_reads() { assert(sz::alignment_score(str("hello"), str("hello"), costs, -1) == 0); assert(sz::alignment_score(str("hello"), str("hell"), costs, -1) == -1); - // Checksums - assert(str("a").checksum() == (std::size_t)'a'); - assert(str("0").checksum() == (std::size_t)'0'); - assert(str("0123456789").checksum() == arithmetic_sum('0', '9')); - assert(str("abcdefghijklmnopqrstuvwxyz").checksum() == arithmetic_sum('a', 'z')); - assert(str("abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz").checksum() == - arithmetic_sum('a', 'z') * 3); - #if _SZ_DEPRECATED_FINGERPRINTS // Computing rolling fingerprints. From 982dd4d3c5459c309c3b8426191e1d730ec4133d Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 00:22:18 +0000 Subject: [PATCH 087/751] Docs: Signatures and typos --- include/stringzilla/types.h | 96 ++++++++++++++++++++++--------------- scripts/bench.hpp | 16 +++---- scripts/bench_sort.cpp | 10 ++-- 3 files changed, 70 insertions(+), 52 deletions(-) diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index 9fb67112..89cf1ce9 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -267,10 +267,11 @@ typedef size_t sz_size_t; // Pointer-sized unsigned integer, 32 or 64 bits typedef ptrdiff_t sz_ssize_t; // Signed version of `sz_size_t`, 32 or 64 bits #else // if SZ_AVOID_LIBC: - -// ! The C standard doesn't specify the signedness of char. -// ! On x86 char is signed by default while on Arm it is unsigned by default. -// ! That's why we don't define `sz_char_t` and generally use explicit `sz_i8_t` and `sz_u8_t`. +/** + * ! The C standard doesn't specify the signedness of char. + * ! On x86 char is signed by default while on Arm it is unsigned by default. + * ! That's why we don't define `sz_char_t` and generally use explicit `sz_i8_t` and `sz_u8_t`. + */ typedef signed char sz_i8_t; // Always 8 bits typedef unsigned char sz_u8_t; // Always 8 bits typedef unsigned short sz_u16_t; // Always 16 bits @@ -279,22 +280,24 @@ typedef unsigned int sz_u32_t; // Always 32 bits typedef long long sz_i64_t; // Always 64 bits typedef unsigned long long sz_u64_t; // Always 64 bits -// Now we need to redefine the `size_t`. -// Microsoft Visual C++ (MSVC) typically follows LLP64 data model on 64-bit platforms, -// where integers, pointers, and long types have different sizes: -// -// > `int` is 32 bits -// > `long` is 32 bits -// > `long long` is 64 bits -// > pointer (thus, `size_t`) is 64 bits -// -// In contrast, GCC and Clang on 64-bit Unix-like systems typically follow the LP64 model, where: -// -// > `int` is 32 bits -// > `long` and pointer (thus, `size_t`) are 64 bits -// > `long long` is also 64 bits -// -// Source: https://learn.microsoft.com/en-us/windows/win32/winprog64/abstract-data-models +/** + * Now we need to redefine the `size_t`. 
+ * Microsoft Visual C++ (MSVC) typically follows LLP64 data model on 64-bit platforms, + * where integers, pointers, and long types have different sizes: + * + * > `int` is 32 bits + * > `long` is 32 bits + * > `long long` is 64 bits + * > pointer (thus, `size_t`) is 64 bits + * + * In contrast, GCC and Clang on 64-bit Unix-like systems typically follow the LP64 model, where: + * + * > `int` is 32 bits + * > `long` and pointer (thus, `size_t`) are 64 bits + * > `long long` is also 64 bits + * + * Source: https://learn.microsoft.com/en-us/windows/win32/winprog64/abstract-data-models + */ #if _SZ_IS_64_BIT typedef unsigned long long sz_size_t; // 64-bit. typedef long long sz_ssize_t; // 64-bit. @@ -438,36 +441,48 @@ SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void #pragma region API Signature Types +/** @brief Signature of ::sz_hash. */ typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t); + +/** @brief Signature of ::sz_checksum. */ typedef sz_u64_t (*sz_checksum_t)(sz_cptr_t, sz_size_t); + +/** @brief Signature of ::sz_equal. */ typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t); + +/** @brief Signature of ::sz_order. */ typedef sz_ordering_t (*sz_order_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -typedef void (*sz_to_converter_t)(sz_cptr_t, sz_size_t, sz_ptr_t); +/** @brief Signature of ::sz_look_up_transform. */ typedef void (*sz_look_up_transform_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t); +/** @brief Signature of ::sz_move. */ typedef void (*sz_move_t)(sz_ptr_t, sz_cptr_t, sz_size_t); +/** @brief Signature of ::sz_fill. */ typedef void (*sz_fill_t)(sz_ptr_t, sz_size_t, sz_u8_t); +/** @brief Signature of ::sz_find_byte. */ typedef sz_cptr_t (*sz_find_byte_t)(sz_cptr_t, sz_size_t, sz_cptr_t); + +/** @brief Signature of ::sz_find. */ typedef sz_cptr_t (*sz_find_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); + +/** @brief Signature of ::sz_find_set. */ typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_charset_t const *); +/** @brief Signature of ::sz_hamming_distance. */ typedef sz_size_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t); +/** @brief Signature of ::sz_edit_distance. */ typedef sz_size_t (*sz_edit_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_memory_allocator_t *); +/** @brief Signature of ::sz_alignment_score. */ typedef sz_ssize_t (*sz_alignment_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *, sz_error_cost_t, sz_memory_allocator_t *); -typedef void (*sz_hash_callback_t)(sz_cptr_t, sz_size_t, sz_u64_t, void *user); - -typedef void (*sz_hashes_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_size_t, sz_hash_callback_t, void *); - -typedef void (*sz_hashes_fingerprint_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_ptr_t, sz_size_t); - -typedef sz_size_t (*sz_hashes_intersection_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_cptr_t, sz_size_t); +/** @brief Signature of ::sz_sort. */ +typedef sz_bool_t (*sz_sort_t)(sz_sequence_t const *, sz_memory_allocator_t *, sz_sorted_idx_t *); #pragma endregion @@ -728,15 +743,16 @@ SZ_PUBLIC void _sz_assert_failure(char const *condition, char const *file, int l */ #if defined(_MSC_VER) && !defined(__clang__) // On Clang-CL #include - -// Sadly, when building Win32 images, we can't use the `_tzcnt_u64`, `_lzcnt_u64`, -// `_BitScanForward64`, or `_BitScanReverse64` intrinsics. For now it's a simple `for`-loop. -// TODO: In the future we can switch to a more efficient De Bruijn's algorithm. 
-// https://www.chessprogramming.org/BitScan -// https://www.chessprogramming.org/De_Bruijn_Sequence -// https://gist.github.com/resilar/e722d4600dbec9752771ab4c9d47044f -// -// Use the serial version on 32-bit x86 and on Arm. +/* + * Sadly, when building Win32 images, we can't use the `_tzcnt_u64`, `_lzcnt_u64`, + * `_BitScanForward64`, or `_BitScanReverse64` intrinsics. For now it's a simple `for`-loop. + * TODO: In the future we can switch to a more efficient De Bruijn's algorithm. + * https://www.chessprogramming.org/BitScan + * https://www.chessprogramming.org/De_Bruijn_Sequence + * https://gist.github.com/resilar/e722d4600dbec9752771ab4c9d47044f + * + * Use the serial version on 32-bit x86 and on Arm. + */ #if (defined(_WIN32) && !defined(_WIN64)) || defined(_M_ARM) || defined(_M_ARM64) SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { _sz_assert(x != 0); @@ -780,8 +796,10 @@ SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return (int)_tzcnt_u32(x); } SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return (int)_lzcnt_u32(x); } SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return (int)__popcnt(x); } #endif -// Force the byteswap functions to be intrinsics, because when /Oi- is given, these will turn into CRT function calls, -// which breaks when `SZ_AVOID_LIBC` is given +/* + * Force the byteswap functions to be intrinsics, because when `/Oi-` is given, + * these will turn into CRT function calls, which breaks when `SZ_AVOID_LIBC` is given. + */ #pragma intrinsic(_byteswap_uint64) SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return _byteswap_uint64(val); } #pragma intrinsic(_byteswap_ulong) diff --git a/scripts/bench.hpp b/scripts/bench.hpp index ecdf3bb2..b321fa7e 100644 --- a/scripts/bench.hpp +++ b/scripts/bench.hpp @@ -63,7 +63,7 @@ struct tracked_function_gt { void print() const { bool is_binary = std::is_same(); - // If failures have occured, output them to file tos implify the debugging process. + // If failures have occurred, output them to file to simplify the debugging process. bool contains_failures = !failed_strings.empty(); if (contains_failures) { // The file name is made of the string hash and the function name. 
@@ -161,7 +161,7 @@ inline std::vector filter_by_length(std::vector benchmark_result_t bench_on_tokens(strings_type &&strings, function_type &&function) { namespace stdc = std::chrono; - using stdcc = stdc::high_resolution_clock; - stdcc::time_point t1 = stdcc::now(); + using clock_t = stdc::high_resolution_clock; + clock_t::time_point t1 = clock_t::now(); benchmark_result_t result; std::size_t lookup_mask = bit_floor(strings.size()) - 1; @@ -254,7 +254,7 @@ benchmark_result_t bench_on_tokens(strings_type &&strings, function_type &&funct result.iterations += 4; } - stdcc::time_point t2 = stdcc::now(); + clock_t::time_point t2 = clock_t::now(); result.seconds = stdc::duration_cast(t2 - t1).count() / 1.e9; if (result.seconds > seconds_per_benchmark) break; } @@ -273,8 +273,8 @@ template benchmark_result_t bench_on_token_pairs(strings_type &&strings, function_type &&function) { namespace stdc = std::chrono; - using stdcc = stdc::high_resolution_clock; - stdcc::time_point t1 = stdcc::now(); + using clock_t = stdc::high_resolution_clock; + clock_t::time_point t1 = clock_t::now(); benchmark_result_t result; std::size_t lookup_mask = bit_floor(strings.size()) - 1; std::size_t largest_prime = static_cast(18446744073709551557ull); @@ -290,7 +290,7 @@ benchmark_result_t bench_on_token_pairs(strings_type &&strings, function_type && result.iterations += 4; } - stdcc::time_point t2 = stdcc::now(); + clock_t::time_point t2 = clock_t::now(); result.seconds = stdc::duration_cast(t2 - t1).count() / 1.e9; if (result.seconds > seconds_per_benchmark) break; } diff --git a/scripts/bench_sort.cpp b/scripts/bench_sort.cpp index f46be4a3..742d1b9b 100644 --- a/scripts/bench_sort.cpp +++ b/scripts/bench_sort.cpp @@ -127,9 +127,9 @@ void expect_same(permute_t const &permute_base, permute_t const &permute_new) { template void bench_permute(char const *name, strings_t &strings, permute_t &permute, algo_at &&algo) { namespace stdc = std::chrono; - using stdcc = stdc::high_resolution_clock; + using clock_t = stdc::high_resolution_clock; constexpr std::size_t iterations = 3; - stdcc::time_point t1 = stdcc::now(); + clock_t::time_point t1 = clock_t::now(); // Run multiple iterations for (std::size_t i = 0; i != iterations; ++i) { @@ -138,10 +138,10 @@ void bench_permute(char const *name, strings_t &strings, permute_t &permute, alg } // Measure elapsed time - stdcc::time_point t2 = stdcc::now(); + clock_t::time_point t2 = clock_t::now(); double dif = stdc::duration_cast(t2 - t1).count() * 1.0; - double milisecs = dif / (iterations * 1e6); - std::printf("Elapsed time is %.2lf miliseconds/iteration for %s.\n", milisecs, name); + double millisecs = dif / (iterations * 1e6); + std::printf("Elapsed time is %.2lf milliseconds/iteration for %s.\n", millisecs, name); } int main(int argc, char const **argv) { From a0318eb4e3e61547a927877df92f405e299ebb3f Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 00:23:05 +0000 Subject: [PATCH 088/751] Make: Renamed scripts/bench_token.cpp -> scripts/bench_fingerprint.cpp --- scripts/{bench_token.cpp => bench_fingerprint.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/{bench_token.cpp => bench_fingerprint.cpp} (100%) diff --git a/scripts/bench_token.cpp b/scripts/bench_fingerprint.cpp similarity index 100% rename from scripts/bench_token.cpp rename to scripts/bench_fingerprint.cpp From 07d2239431c2cba513f951803b0ea0f7fc942366 Mon Sep 17 00:00:00 2001 From: Ash Vardanian 
<1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 00:23:05 +0000 Subject: [PATCH 089/751] Make: Renamed scripts/bench_token.cpp -> temp-git-split-file --- scripts/bench_token.cpp => temp-git-split-file | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/bench_token.cpp => temp-git-split-file (100%) diff --git a/scripts/bench_token.cpp b/temp-git-split-file similarity index 100% rename from scripts/bench_token.cpp rename to temp-git-split-file From 031bedfca6ec7a120e83eba196d8a174b6872796 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 00:23:05 +0000 Subject: [PATCH 090/751] Make: Renamed temp-git-split-file -> scripts/bench_token.cpp --- temp-git-split-file => scripts/bench_token.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename temp-git-split-file => scripts/bench_token.cpp (100%) diff --git a/temp-git-split-file b/scripts/bench_token.cpp similarity index 100% rename from temp-git-split-file rename to scripts/bench_token.cpp From 187e0bdbeab85634e6a655225ebc54089da11bef Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 00:25:50 +0000 Subject: [PATCH 091/751] Improve: Separate fingerprinting benchmarks --- scripts/bench_fingerprint.cpp | 113 ---------------------------------- scripts/bench_token.cpp | 68 +------------------- 2 files changed, 1 insertion(+), 180 deletions(-) diff --git a/scripts/bench_fingerprint.cpp b/scripts/bench_fingerprint.cpp index 2e694588..82064a29 100644 --- a/scripts/bench_fingerprint.cpp +++ b/scripts/bench_fingerprint.cpp @@ -11,44 +11,6 @@ using namespace ashvardanian::stringzilla::scripts; -tracked_unary_functions_t checksum_functions() { - auto wrap_sz = [](auto function) -> unary_function_t { - return unary_function_t([function](std::string_view s) { return function(s.data(), s.size()); }); - }; - tracked_unary_functions_t result = { - {"std::accumulate", - [](std::string_view s) { - return std::accumulate(s.begin(), s.end(), (std::size_t)0, - [](std::size_t sum, char c) { return sum + static_cast(c); }); - }}, - {"sz_checksum_serial", wrap_sz(sz_checksum_serial), true}, -#if SZ_USE_HASWELL - {"sz_checksum_haswell", wrap_sz(sz_checksum_haswell), true}, -#endif -#if SZ_USE_SKYLAKE - {"sz_checksum_skylake", wrap_sz(sz_checksum_skylake), true}, -#endif -#if SZ_USE_ICE - {"sz_checksum_ice", wrap_sz(sz_checksum_ice), true}, -#endif -#if SZ_USE_NEON - {"sz_checksum_neon", wrap_sz(sz_checksum_neon), true}, -#endif - }; - return result; -} - -tracked_unary_functions_t hashing_functions() { - auto wrap_sz = [](auto function) -> unary_function_t { - return unary_function_t([function](std::string_view s) { return function(s.data(), s.size()); }); - }; - tracked_unary_functions_t result = { - {"sz_hash_serial", wrap_sz(sz_hash_serial)}, - {"std::hash", [](std::string_view s) { return std::hash {}(s); }}, - }; - return result; -} - tracked_unary_functions_t sliding_hashing_functions(std::size_t window_width, std::size_t step) { #if _SZ_DEPRECATED_FINGERPRINTS auto wrap_sz = [=](auto function) -> unary_function_t { @@ -116,59 +78,6 @@ tracked_unary_functions_t random_generation_functions(std::size_t token_length) return result; } -tracked_binary_functions_t equality_functions() { - auto wrap_sz = [](auto function) -> binary_function_t { - return binary_function_t([function](std::string_view a, std::string_view b) { - return (a.size() == b.size() && function(a.data(), b.data(), a.size())); - 
}); - }; - tracked_binary_functions_t result = { - {"std::string_view.==", [](std::string_view a, std::string_view b) { return (a == b); }}, - {"sz_equal_serial", wrap_sz(sz_equal_serial), true}, -#if SZ_USE_HASWELL - {"sz_equal_haswell", wrap_sz(sz_equal_haswell), true}, -#endif -#if SZ_USE_SKYLAKE - {"sz_equal_skylake", wrap_sz(sz_equal_skylake), true}, -#endif - {"memcmp", - [](std::string_view a, std::string_view b) { - return (a.size() == b.size() && memcmp(a.data(), b.data(), a.size()) == 0); - }}, - }; - return result; -} - -tracked_binary_functions_t ordering_functions() { - auto wrap_sz = [](auto function) -> binary_function_t { - return binary_function_t([function](std::string_view a, std::string_view b) { - return function(a.data(), a.size(), b.data(), b.size()); - }); - }; - tracked_binary_functions_t result = { - {"std::string_view.compare", - [](std::string_view a, std::string_view b) { - auto order = a.compare(b); - return (order == 0 ? sz_equal_k : (order < 0 ? sz_less_k : sz_greater_k)); - }}, - {"sz_order_serial", wrap_sz(sz_order_serial), true}, -#if SZ_USE_HASWELL - {"sz_order_haswell", wrap_sz(sz_order_haswell), true}, -#endif -#if SZ_USE_SKYLAKE - {"sz_order_skylake", wrap_sz(sz_order_skylake), true}, -#endif - {"memcmp", - [](std::string_view a, std::string_view b) { - auto order = memcmp(a.data(), b.data(), a.size() < b.size() ? a.size() : b.size()); - return order != 0 ? (a.size() == b.size() ? (order < 0 ? sz_less_k : sz_greater_k) - : (a.size() < b.size() ? sz_less_k : sz_greater_k)) - : sz_equal_k; - }}, - }; - return result; -} - template void bench_dereferencing(std::string name, std::vector strings) { auto func = unary_function_t([](std::string_view s) { return s.size(); }); @@ -183,8 +92,6 @@ void bench(strings_type &&strings) { // Benchmark logical operations bench_unary_functions(strings, checksum_functions()); bench_unary_functions(strings, hashing_functions()); - bench_unary_functions(strings, sliding_hashing_functions(8, 1)); - bench_unary_functions(strings, fingerprinting_functions()); bench_binary_functions(strings, equality_functions()); bench_binary_functions(strings, ordering_functions()); @@ -198,11 +105,7 @@ void bench(strings_type &&strings) { void bench_on_input_data(int argc, char const **argv) { dataset_t dataset = prepare_benchmark_environment(argc, argv); -#if 0 std::printf("Benchmarking on the entire dataset:\n"); - bench_unary_functions(dataset.tokens, random_generation_functions(100)); - bench_unary_functions(dataset.tokens, random_generation_functions(20)); - bench_unary_functions(dataset.tokens, random_generation_functions(5)); // When performing fingerprinting, it's extremely important to: // 1. Have small output fingerprints that fit the cache. 
@@ -215,25 +118,9 @@ void bench_on_input_data(int argc, char const **argv) { bench_unary_functions>({dataset.text}, sliding_hashing_functions(33, 8)); bench_unary_functions>({dataset.text}, sliding_hashing_functions(127, 16)); - bench_unary_functions>({dataset.text}, hashing_functions()); - bench_unary_functions>({dataset.text}, fingerprinting_functions(128, 4 * 1024)); bench_unary_functions>({dataset.text}, fingerprinting_functions(128, 64 * 1024)); bench_unary_functions>({dataset.text}, fingerprinting_functions(128, 1024 * 1024)); -#endif - // Baseline benchmarks for real words, coming in all lengths - std::printf("Benchmarking on real words:\n"); - bench(dataset.tokens); - std::printf("Benchmarking on real lines:\n"); - bench(dataset.lines); - std::printf("Benchmarking on entire dataset:\n"); - bench>({dataset.text}); - - // Run benchmarks on tokens of different length - for (std::size_t token_length : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32}) { - std::printf("Benchmarking on real words of length %zu:\n", token_length); - bench(filter_by_length(dataset.tokens, token_length)); - } } void bench_on_synthetic_data() { diff --git a/scripts/bench_token.cpp b/scripts/bench_token.cpp index 2e694588..749daa85 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -49,48 +49,6 @@ tracked_unary_functions_t hashing_functions() { return result; } -tracked_unary_functions_t sliding_hashing_functions(std::size_t window_width, std::size_t step) { -#if _SZ_DEPRECATED_FINGERPRINTS - auto wrap_sz = [=](auto function) -> unary_function_t { - return unary_function_t([function, window_width, step](std::string_view s) { - sz_size_t mixed_hash = 0; - function(s.data(), s.size(), window_width, step, _sz_hashes_fingerprint_scalar_callback, &mixed_hash); - return mixed_hash; - }); - }; -#endif - std::string suffix = std::to_string(window_width) + ":step" + std::to_string(step); - tracked_unary_functions_t result = { -#if _SZ_DEPRECATED_FINGERPRINTS -#if SZ_USE_ICE - {"sz_hashes_ice:" + suffix, wrap_sz(sz_hashes_ice)}, -#endif -#if SZ_USE_HASWELL - {"sz_hashes_haswell:" + suffix, wrap_sz(sz_hashes_haswell)}, -#endif - {"sz_hashes_serial:" + suffix, wrap_sz(sz_hashes_serial)}, -#endif - }; - return result; -} - -tracked_unary_functions_t fingerprinting_functions(std::size_t window_width = 8, std::size_t fingerprint_bytes = 4096) { - using fingerprint_slot_t = std::uint8_t; - static std::vector fingerprint; - fingerprint.resize(fingerprint_bytes / sizeof(fingerprint_slot_t)); - auto wrap_sz = [](auto function) -> unary_function_t { - return unary_function_t([function](std::string_view s) { - sz_size_t mixed_hash = 0; - sz_unused(s); - return mixed_hash; - }); - }; - tracked_unary_functions_t result = {}; - sz_unused(window_width && fingerprint_bytes); - sz_unused(wrap_sz); - return result; -} - tracked_unary_functions_t random_generation_functions(std::size_t token_length) { static std::vector buffer; if (buffer.size() < token_length) buffer.resize(token_length); @@ -183,8 +141,6 @@ void bench(strings_type &&strings) { // Benchmark logical operations bench_unary_functions(strings, checksum_functions()); bench_unary_functions(strings, hashing_functions()); - bench_unary_functions(strings, sliding_hashing_functions(8, 1)); - bench_unary_functions(strings, fingerprinting_functions()); bench_binary_functions(strings, equality_functions()); bench_binary_functions(strings, ordering_functions()); @@ -198,29 +154,7 @@ void bench(strings_type &&strings) { void bench_on_input_data(int argc, char const **argv) { 
dataset_t dataset = prepare_benchmark_environment(argc, argv); -#if 0 - std::printf("Benchmarking on the entire dataset:\n"); - bench_unary_functions(dataset.tokens, random_generation_functions(100)); - bench_unary_functions(dataset.tokens, random_generation_functions(20)); - bench_unary_functions(dataset.tokens, random_generation_functions(5)); - - // When performing fingerprinting, it's extremely important to: - // 1. Have small output fingerprints that fit the cache. - // 2. Have that memory in close affinity to the core, ideally on stack, to avoid cache coherency problems. - // This introduces an additional challenge for efficient fingerprinting, as the CPU caches vary a lot. - // On the Intel Sapphire Rapids 6455B Gold CPU they are 96 KiB x2 for L1d, 4 MiB x2 for L2. - // Spilling into the L3 is a bad idea. - bench_unary_functions>({dataset.text}, sliding_hashing_functions(7, 1)); - bench_unary_functions>({dataset.text}, sliding_hashing_functions(17, 4)); - bench_unary_functions>({dataset.text}, sliding_hashing_functions(33, 8)); - bench_unary_functions>({dataset.text}, sliding_hashing_functions(127, 16)); - - bench_unary_functions>({dataset.text}, hashing_functions()); - - bench_unary_functions>({dataset.text}, fingerprinting_functions(128, 4 * 1024)); - bench_unary_functions>({dataset.text}, fingerprinting_functions(128, 64 * 1024)); - bench_unary_functions>({dataset.text}, fingerprinting_functions(128, 1024 * 1024)); -#endif + // Baseline benchmarks for real words, coming in all lengths std::printf("Benchmarking on real words:\n"); bench(dataset.tokens); From 66f2ac91cff0793490bc690d1911c21976c62f82 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 00:27:25 +0000 Subject: [PATCH 092/751] Fix: Sorting benchmarks for new API --- scripts/bench_sort.cpp | 126 ++++++++++++++++------------------------- 1 file changed, 48 insertions(+), 78 deletions(-) diff --git a/scripts/bench_sort.cpp b/scripts/bench_sort.cpp index 742d1b9b..a2c9817f 100644 --- a/scripts/bench_sort.cpp +++ b/scripts/bench_sort.cpp @@ -153,91 +153,61 @@ int main(int argc, char const **argv) { permute_base.resize(strings.size()); permute_new.resize(strings.size()); - // Partitioning - { - std::printf("---- Partitioning:\n"); - bench_permute("std::partition", strings, permute_base, [](strings_t const &strings, permute_t &permute) { - std::partition(permute.begin(), permute.end(), [&](size_t i) { return strings[i].size() < 4; }); - }); - expect_partitioned_by_length(strings, permute_base); - - bench_permute("std::stable_partition", strings, permute_base, [](strings_t const &strings, permute_t &permute) { - std::stable_partition(permute.begin(), permute.end(), [&](size_t i) { return strings[i].size() < 4; }); - }); - expect_partitioned_by_length(strings, permute_base); - - bench_permute("sz_partition", strings, permute_new, [](strings_t const &strings, permute_t &permute) { - sz_sequence_t array; - array.order = permute.data(); - array.count = strings.size(); - array.handle = &strings; - sz_partition(&array, &has_under_four_chars); - }); - expect_partitioned_by_length(strings, permute_new); - // TODO: expect_same(permute_base, permute_new); - } - // Sorting - { - std::printf("---- Sorting:\n"); - bench_permute("std::sort", strings, permute_base, [](strings_t const &strings, permute_t &permute) { - std::sort(permute.begin(), permute.end(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; }); - }); - expect_sorted(strings, permute_base); - - 
bench_permute("sz_sort", strings, permute_new, [](strings_t const &strings, permute_t &permute) { - sz_sequence_t array; - array.order = permute.data(); - array.count = strings.size(); - array.handle = &strings; - array.get_start = get_start; - array.get_length = get_length; - sz_sort(&array); - }); - expect_sorted(strings, permute_new); + bench_permute("std::sort", strings, permute_base, [](strings_t const &strings, permute_t &permute) { + std::sort(permute.begin(), permute.end(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; }); + }); + expect_sorted(strings, permute_base); + + bench_permute("sz_sort", strings, permute_new, [](strings_t const &strings, permute_t &permute) { + sz_sequence_t array; + array.count = strings.size(); + array.handle = &strings; + array.get_start = get_start; + array.get_length = get_length; + sz_sort(&array, NULL, permute.data()); + }); + expect_sorted(strings, permute_new); #if __linux__ && defined(_GNU_SOURCE) - bench_permute("qsort_r", strings, permute_new, [](strings_t const &strings, permute_t &permute) { - sz_sequence_t array; - array.order = permute.data(); - array.count = strings.size(); - array.handle = &strings; - array.get_start = get_start; - array.get_length = get_length; - qsort_r(array.order, array.count, sizeof(sz_u64_t), _get_qsort_order, &array); - }); - expect_sorted(strings, permute_new); + bench_permute("qsort_r", strings, permute_new, [](strings_t const &strings, permute_t &permute) { + sz_sequence_t array; + array.count = strings.size(); + array.handle = &strings; + array.get_start = get_start; + array.get_length = get_length; + qsort_r(permute.data(), array.count, sizeof(sz_u64_t), _get_qsort_order, &array); + }); + expect_sorted(strings, permute_new); #elif defined(_MSC_VER) - bench_permute("qsort_s", strings, permute_new, [](strings_t const &strings, permute_t &permute) { - sz_sequence_t array; - array.order = permute.data(); - array.count = strings.size(); - array.handle = &strings; - array.get_start = get_start; - array.get_length = get_length; - qsort_s(array.order, array.count, sizeof(sz_u64_t), _get_qsort_order, &array); - }); - expect_sorted(strings, permute_new); + bench_permute("qsort_s", strings, permute_new, [](strings_t const &strings, permute_t &permute) { + sz_sequence_t array; + array.count = strings.size(); + array.handle = &strings; + array.get_start = get_start; + array.get_length = get_length; + qsort_s(permute.data(), array.count, sizeof(sz_u64_t), _get_qsort_order, &array); + }); + expect_sorted(strings, permute_new); #else - sz_unused(_get_qsort_order); + sz_unused(_get_qsort_order); #endif - bench_permute("hybrid_sort_cpp", strings, permute_new, - [](strings_t const &strings, permute_t &permute) { hybrid_sort_cpp(strings, permute.data()); }); - expect_sorted(strings, permute_new); - - std::printf("---- Stable Sorting:\n"); - bench_permute("std::stable_sort", strings, permute_base, [](strings_t const &strings, permute_t &permute) { - std::stable_sort(permute.begin(), permute.end(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; }); - }); - expect_sorted(strings, permute_base); - - bench_permute( - "hybrid_stable_sort_cpp", strings, permute_base, - [](strings_t const &strings, permute_t &permute) { hybrid_stable_sort_cpp(strings, permute.data()); }); - expect_sorted(strings, permute_new); - expect_same(permute_base, permute_new); - } + bench_permute("hybrid_sort_cpp", strings, permute_new, + [](strings_t const &strings, permute_t &permute) { hybrid_sort_cpp(strings, permute.data()); }); + 
expect_sorted(strings, permute_new); + + std::printf("---- Stable Sorting:\n"); + bench_permute("std::stable_sort", strings, permute_base, [](strings_t const &strings, permute_t &permute) { + std::stable_sort(permute.begin(), permute.end(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; }); + }); + expect_sorted(strings, permute_base); + + bench_permute("hybrid_stable_sort_cpp", strings, permute_base, [](strings_t const &strings, permute_t &permute) { + hybrid_stable_sort_cpp(strings, permute.data()); + }); + expect_sorted(strings, permute_new); + expect_same(permute_base, permute_new); return 0; } \ No newline at end of file From 13bace253bc112cdfd41aa7ede824b0a2bc96790 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 00:36:22 +0000 Subject: [PATCH 093/751] Fix: In C++11 `constexpr` constructor must be empty --- include/stringzilla/stringzilla.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index 89fbd39b..664ce607 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -284,7 +284,7 @@ class basic_char_set { public: using char_type = char_type_; - constexpr basic_char_set() noexcept { + sz_constexpr_if_cpp14 basic_char_set() noexcept { // ! Instead of relying on the `sz_charset_init`, we have to reimplement it to support `constexpr`. bitset_._u64s[0] = 0, bitset_._u64s[1] = 0, bitset_._u64s[2] = 0, bitset_._u64s[3] = 0; } @@ -311,8 +311,8 @@ class basic_char_set { } } - constexpr basic_char_set(basic_char_set const &other) noexcept : bitset_(other.bitset_) {} - constexpr basic_char_set &operator=(basic_char_set const &other) noexcept { + sz_constexpr_if_cpp14 basic_char_set(basic_char_set const &other) noexcept : bitset_(other.bitset_) {} + sz_constexpr_if_cpp14 basic_char_set &operator=(basic_char_set const &other) noexcept { bitset_ = other.bitset_; return *this; } @@ -1244,8 +1244,8 @@ class basic_string_slice { : start_(c_string), length_(null_terminated_length(c_string)) {} constexpr basic_string_slice(pointer c_string, size_type length) noexcept : start_(c_string), length_(length) {} - constexpr basic_string_slice(basic_string_slice const &other) noexcept = default; - constexpr basic_string_slice &operator=(basic_string_slice const &other) noexcept = default; + basic_string_slice(basic_string_slice const &other) noexcept = default; + basic_string_slice &operator=(basic_string_slice const &other) noexcept = default; basic_string_slice(std::nullptr_t) = delete; /** @brief Exchanges the view with that of the `other`. 
*/ From eab31371a28d07a552ab542cf371c837ab946905 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 00:38:31 +0000 Subject: [PATCH 094/751] Fox: C library build --- c/lib.c | 6 ++---- include/stringzilla/sort.h | 13 +++++++++++-- include/stringzilla/types.h | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/c/lib.c b/c/lib.c index 5a4183cd..361cd049 100644 --- a/c/lib.c +++ b/c/lib.c @@ -416,10 +416,8 @@ SZ_DYNAMIC sz_ssize_t sz_alignment_score( // return sz_dispatch_table.alignment_score(a, a_length, b, b_length, subs, gap, alloc); } -SZ_DYNAMIC void sz_hashes( // - sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, // - sz_hash_callback_t callback, void *callback_handle) { - sz_dispatch_table.hashes(text, length, window_length, step, callback, callback_handle); +SZ_DYNAMIC sz_bool_t sz_sort(sz_sequence_t const *array, sz_memory_allocator_t *alloc, sz_size_t *order) { + return sz_dispatch_table.sort(array, alloc, order); } SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index e517159d..3ab89737 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -29,7 +29,7 @@ extern "C" { * @param order The output - indices of the sorted collection elements. * @return Whether the operation was successful. */ -SZ_PUBLIC sz_bool_t sz_sort(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); +SZ_DYNAMIC sz_bool_t sz_sort(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); /** @copydoc sz_sort */ SZ_PUBLIC sz_bool_t sz_sort_serial(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, @@ -306,10 +306,19 @@ SZ_PUBLIC void _sz_sort_ice_recursively( / #pragma endregion // Ice Lake Implementation -SZ_PUBLIC sz_bool_t sz_sort(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order) { +/* Pick the right implementation for the string search algorithms. + * To override this behavior and precompile all backends - set `SZ_DYNAMIC_DISPATCH` to 1. + */ +#pragma region Compile Time Dispatching +#if !SZ_DYNAMIC_DISPATCH + +SZ_DYNAMIC sz_bool_t sz_sort(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order) { return sz_sort_serial(collection, alloc, order); } +#endif // !SZ_DYNAMIC_DISPATCH +#pragma endregion // Compile Time Dispatching + #ifdef __cplusplus } #endif // __cplusplus diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index 89cf1ce9..d241b69f 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -482,7 +482,7 @@ typedef sz_ssize_t (*sz_alignment_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_s sz_error_cost_t, sz_memory_allocator_t *); /** @brief Signature of ::sz_sort. 
*/ -typedef sz_bool_t (*sz_sort_t)(sz_sequence_t const *, sz_memory_allocator_t *, sz_sorted_idx_t *); +typedef sz_bool_t (*sz_sort_t)(struct sz_sequence_t const *, sz_memory_allocator_t *, sz_sorted_idx_t *); #pragma endregion From 17f28a33f5c4d0ae6ddaf4ebf71ac76235df2231 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 11:45:40 +0000 Subject: [PATCH 095/751] Fix: `uniform_int_distribution` upper bound --- scripts/test.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/test.hpp b/scripts/test.hpp index 261f90a5..6d85dec2 100644 --- a/scripts/test.hpp +++ b/scripts/test.hpp @@ -52,7 +52,7 @@ struct uniform_uint8_distribution_t { }; inline void randomize_string(char *string, std::size_t length, char const *alphabet, std::size_t cardinality) { - uniform_uint8_distribution_t distribution(cardinality); + uniform_uint8_distribution_t distribution(cardinality - 1); std::generate(string, string + length, [&]() -> char { return alphabet[distribution(global_random_generator())]; }); } From a818f978ccd1aad769c5268560929b04c2836b6b Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 11:47:05 +0000 Subject: [PATCH 096/751] Make: Recommend pretty-printing GDB symbols --- .vscode/launch.json | 18 ++++++++++++++++-- CONTRIBUTING.md | 8 ++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 71d59186..34ec245d 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -19,7 +19,14 @@ ], "stopAtEntry": false, "linux": { - "MIMode": "gdb" + "MIMode": "gdb", + "setupCommands": [ + { + "description": "Enable pretty-printing for GDB", + "text": "-enable-pretty-printing", + "ignoreFailures": true + } + ] }, "osx": { "MIMode": "lldb" @@ -48,7 +55,14 @@ "stopAtEntry": false, "preLaunchTask": "Build Benchmarks: Debug", "linux": { - "MIMode": "gdb" + "MIMode": "gdb", + "setupCommands": [ + { + "description": "Enable pretty-printing for GDB", + "text": "-enable-pretty-printing", + "ignoreFailures": true + } + ] }, "osx": { "MIMode": "lldb" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index dfb4fb2f..d6009a30 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -104,11 +104,15 @@ For Python code: The primary C implementation and the C++ wrapper are built with CMake. Assuming the extensive use of new SIMD intrinsics and recent C++ language features, using a recent compiler is recommended. -We prefer GCC 12, which is available from default Ubuntu repositories with Ubuntu 22.04 LTS onwards. +We prefer GCC 12 or newer, which is available from default Ubuntu repositories with Ubuntu 22.04 LTS onwards. If this is your first experience with CMake, use the following commands to get started on Ubuntu: ```bash -sudo apt-get update && sudo apt-get install cmake build-essential libjemalloc-dev g++-12 gcc-12 +sudo apt-get update +sudo apt-get install build-essential +sudo apt-get install cmake # Consider pulling a newer version from PyPI +sudo apt-get install g++-12 gcc-12 # You may already have a newer version on Ubuntu 24 +sudo apt install libstdc++6-12-dbg # STL debugging symbols for GCC 12 ``` On MacOS it's recommended to use Homebrew and install Clang, as opposed to "Apple Clang". 
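Both bounds of `std::uniform_int_distribution` are inclusive, which is what the `cardinality - 1` change to `randomize_string` above relies on: an alphabet of `cardinality` characters must be indexed with values in [0, cardinality - 1], otherwise the highest sample lands one element past the intended range, and for a string-literal alphabet that silently injects the trailing '\0'. Below is a minimal sketch of the same idea with the standard distribution; the project's `uniform_uint8_distribution_t` wrapper is assumed to follow the same inclusive convention, and the helper name is illustrative rather than part of the library.

#include <algorithm>
#include <cstddef>
#include <random>
#include <string>

// Draw every character of `text` uniformly from `alphabet[0 .. cardinality - 1]`.
inline void fill_with_random_chars(std::string &text, char const *alphabet, std::size_t cardinality) {
    static std::mt19937_64 generator {42};                                        // any seeded engine works for a sketch
    std::uniform_int_distribution<std::size_t> distribution(0, cardinality - 1);  // both ends are inclusive
    std::generate(text.begin(), text.end(), [&] { return alphabet[distribution(generator)]; });
}

// Usage: a 16-byte string over a 4-character alphabet; indices never reach `alphabet[4]`.
// std::string text(16, '\0');
// fill_with_random_chars(text, "abcd", 4);

Spelling both bounds out explicitly also sidesteps any ambiguity about what a single-argument constructor means, which is how the lower-bound fix later in this series settles the same question.
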
From 5970fa40abe7e16d6a82436ec43bbf8e978db3ba Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 11:47:23 +0000 Subject: [PATCH 097/751] Fix: Underflow in serial sorting --- include/stringzilla/sort.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index 3ab89737..c96757e6 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -116,7 +116,7 @@ SZ_PUBLIC sz_size_t _sz_sort_serial_partition( // Loop through the collection and move the elements around the pivot. sz_size_t left_offset = start_in_collection; sz_size_t right_offset = end_in_collection - 1; - while (left_offset <= right_offset) { + while (left_offset < right_offset) { // Find the first element on the left that is greater than the pivot. while (global_windows[left_offset] < pivot_window) ++left_offset; // Find the first element on the right that is less than the pivot. @@ -188,7 +188,7 @@ SZ_PUBLIC void _sz_sort_serial_next_window( / // If the identical windows are not trivial and each string has more characters, sort them recursively sz_cptr_t current_window_str = (sz_cptr_t)¤t_window_integer; - int current_window_length = current_window_str[window_capacity]; + sz_size_t current_window_length = (sz_size_t)current_window_str[window_capacity]; if (nested_end - nested_start > 1 && current_window_length == window_capacity) { _sz_sort_serial_next_window(collection, global_windows, global_order, nested_start, nested_end, start_character + window_capacity); From 50d82910b2300377b01fb1b7044eb1d2d78387ca Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 11:47:37 +0000 Subject: [PATCH 098/751] Improve: Drop hybrid sort code --- scripts/bench_sort.cpp | 147 ++--------------------------------------- 1 file changed, 5 insertions(+), 142 deletions(-) diff --git a/scripts/bench_sort.cpp b/scripts/bench_sort.cpp index 9b4ee90d..ac81e233 100644 --- a/scripts/bench_sort.cpp +++ b/scripts/bench_sort.cpp @@ -16,6 +16,7 @@ #include using namespace ashvardanian::stringzilla::scripts; +namespace sz = ashvardanian::stringzilla; using strings_t = std::vector; using idx_t = sz_size_t; @@ -33,11 +34,6 @@ static sz_size_t get_length(sz_sequence_t const *array_c, sz_size_t i) { return array[i].size(); } -static sz_bool_t has_under_four_chars(sz_sequence_t const *array_c, sz_size_t i) { - strings_t const &array = *reinterpret_cast(array_c->handle); - return (sz_bool_t)(array[i].size() < 4); -} - #if defined(_MSC_VER) static int _get_qsort_order(void *arg, const void *a, const void *b) { #else @@ -47,8 +43,8 @@ static int _get_qsort_order(const void *a, const void *b, void *arg) { sz_size_t idx_a = *(sz_size_t *)a; sz_size_t idx_b = *(sz_size_t *)b; - const char *str_a = sequence->get_start(sequence, idx_a); - const char *str_b = sequence->get_start(sequence, idx_b); + char const *str_a = sequence->get_start(sequence, idx_a); + char const *str_b = sequence->get_start(sequence, idx_b); sz_size_t len_a = sequence->get_length(sequence, idx_a); sz_size_t len_b = sequence->get_length(sequence, idx_b); @@ -58,136 +54,12 @@ static int _get_qsort_order(const void *a, const void *b, void *arg) { #pragma endregion -void populate_from_file(std::string path, strings_t &strings, - std::size_t limit = std::numeric_limits::max()) { - - std::ifstream f(path, std::ios::in); - std::string s; - while (strings.size() < limit && std::getline(f, s, ' ')) 
strings.push_back(s); -} - -constexpr size_t offset_in_word = 4; - -static idx_t hybrid_sort_cpp(strings_t const &strings, sz_u64_t *order) { - - // What if we take up-to 4 first characters and the index - for (size_t i = 0; i != strings.size(); ++i) { - size_t index = order[i]; - - for (size_t j = 0; j < std::min(strings[(sz_size_t)index].size(), 4ul); ++j) { - std::memcpy((char *)&order[i] + offset_in_word + 3 - j, strings[(sz_size_t)index].c_str() + j, 1ul); - } - } - - std::sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { - char *i_bytes = (char *)&i; - char *j_bytes = (char *)&j; - return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word); - }); - - const auto extract_bytes = [](sz_u64_t v) -> uint32_t { - char *bytes = (char *)&v; - return *(uint32_t *)(bytes + offset_in_word); - }; - - if (strings.size() >= 2) { - size_t prev_index = 0; - uint64_t prev_bytes = extract_bytes(order[0]); - - for (size_t i = 1; i < strings.size(); ++i) { - uint32_t bytes = extract_bytes(order[i]); - if (bytes != prev_bytes) { - std::sort(order + prev_index, order + i, [&](sz_u64_t i, sz_u64_t j) { - // Assumes: offset_in_word==4 - sz_size_t i_index = i & 0xFFFF'FFFF; - sz_size_t j_index = j & 0xFFFF'FFFF; - return strings[i_index] < strings[j_index]; - }); - prev_index = i; - prev_bytes = bytes; - } - } - - std::sort(order + prev_index, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { - sz_size_t i_index = i & 0xFFFF'FFFF; - sz_size_t j_index = j & 0xFFFF'FFFF; - return strings[i_index] < strings[j_index]; - }); - } - - for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul); - - return strings.size(); -} - -static idx_t hybrid_stable_sort_cpp(strings_t const &strings, sz_u64_t *order) { - - // What if we take up-to 4 first characters and the index - for (size_t i = 0; i != strings.size(); ++i) { - size_t index = order[i]; - - for (size_t j = 0; j < std::min(strings[(sz_size_t)index].size(), 4ul); ++j) { - std::memcpy((char *)&order[i] + offset_in_word + 3 - j, strings[(sz_size_t)index].c_str() + j, 1ul); - } - } - - std::stable_sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { - char *i_bytes = (char *)&i; - char *j_bytes = (char *)&j; - return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word); - }); - - const auto extract_bytes = [](sz_u64_t v) -> uint32_t { - char *bytes = (char *)&v; - return *(uint32_t *)(bytes + offset_in_word); - }; - - if (strings.size() >= 2) { - size_t prev_index = 0; - uint64_t prev_bytes = extract_bytes(order[0]); - - for (size_t i = 1; i < strings.size(); ++i) { - uint32_t bytes = extract_bytes(order[i]); - if (bytes != prev_bytes) { - std::stable_sort(order + prev_index, order + i, [&](sz_u64_t i, sz_u64_t j) { - // Assumes: offset_in_word==4 - sz_size_t i_index = i & 0xFFFF'FFFF; - sz_size_t j_index = j & 0xFFFF'FFFF; - return strings[i_index] < strings[j_index]; - }); - prev_index = i; - prev_bytes = bytes; - } - } - - std::stable_sort(order + prev_index, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { - sz_size_t i_index = i & 0xFFFF'FFFF; - sz_size_t j_index = j & 0xFFFF'FFFF; - return strings[i_index] < strings[j_index]; - }); - } - - for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul); - - return strings.size(); -} - -void expect_partitioned_by_length(strings_t const &strings, permute_t const &permute) { - if (!std::is_partitioned(permute.begin(), permute.end(), 
[&](size_t i) { return strings[i].size() < 4; })) - throw std::runtime_error("Partitioning failed!"); -} - void expect_sorted(strings_t const &strings, permute_t const &permute) { if (!std::is_sorted(permute.begin(), permute.end(), [&](std::size_t i, std::size_t j) { return strings[i] < strings[j]; })) throw std::runtime_error("Sorting failed!"); } -void expect_same(permute_t const &permute_base, permute_t const &permute_new) { - if (!std::equal(permute_base.begin(), permute_base.end(), permute_new.begin())) - throw std::runtime_error("Permutations differ!"); -} - template void bench_permute(char const *name, strings_t &strings, permute_t &permute, algo_at &&algo) { namespace stdc = std::chrono; @@ -229,7 +101,8 @@ int main(int argc, char const **argv) { array.handle = &strings; array.get_start = get_start; array.get_length = get_length; - sz_sort(&array, NULL, permute.data()); + sz::_with_alloc>( + [&](sz_memory_allocator_t &alloc) { return sz_sort(&array, &alloc, permute.data()); }); }); expect_sorted(strings, permute_new); @@ -257,21 +130,11 @@ int main(int argc, char const **argv) { sz_unused(_get_qsort_order); #endif - bench_permute("hybrid_sort_cpp", strings, permute_new, - [](strings_t const &strings, permute_t &permute) { hybrid_sort_cpp(strings, permute.data()); }); - expect_sorted(strings, permute_new); - std::printf("---- Stable Sorting:\n"); bench_permute("std::stable_sort", strings, permute_base, [](strings_t const &strings, permute_t &permute) { std::stable_sort(permute.begin(), permute.end(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; }); }); expect_sorted(strings, permute_base); - bench_permute("hybrid_stable_sort_cpp", strings, permute_new, [](strings_t const &strings, permute_t &permute) { - hybrid_stable_sort_cpp(strings, permute.data()); - }); - expect_sorted(strings, permute_new); - expect_same(permute_base, permute_new); - return 0; } From c670ccd62c36bcf8d09e8874ebc1e41d4bae93ff Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 11:48:04 +0000 Subject: [PATCH 099/751] Add: String sorting tests for different lengths --- scripts/test.cpp | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/scripts/test.cpp b/scripts/test.cpp index 0cf11552..d4df88eb 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -1595,22 +1595,49 @@ static void test_sequence_algorithms() { using strs_t = std::vector; using order_t = std::vector; + // Basic tests with predetermined orders. assert_scoped(strs_t x({"a", "b", "c", "d"}), (void)0, sz::sorted_order(x) == order_t({0u, 1u, 2u, 3u})); assert_scoped(strs_t x({"b", "c", "d", "a"}), (void)0, sz::sorted_order(x) == order_t({3u, 0u, 1u, 2u})); assert_scoped(strs_t x({"b", "a", "d", "c"}), (void)0, sz::sorted_order(x) == order_t({1u, 0u, 3u, 2u})); - // Generate random strings of different lengths. - for (std::size_t dataset_size : {10, 100, 1000, 10000}) { - // Build the dataset. + // Test on long strings of identical length. 
+ for (std::size_t dataset_size : {10u, 40u, 1000u, 10000u}) { strs_t dataset; - for (std::size_t i = 0; i != dataset_size; ++i) - dataset.push_back(sz::scripts::random_string(i % 32, "abcdefghijklmnopqrstuvwxyz", 26)); + constexpr std::size_t long_length = 20; + dataset.reserve(dataset_size); + for (std::size_t i = 0; i < dataset_size; ++i) + dataset.push_back(sz::scripts::random_string(long_length, "abcd", 4)); + + auto order = sz::sorted_order(dataset); + for (std::size_t i = 1; i < dataset.size(); ++i) assert(dataset[order[i - 1]] <= dataset[order[i]]); + } + + // Test on random strings of varying (but small) lengths. + for (std::size_t dataset_size : {10u, 40u, 1000u, 10000u}) { + strs_t dataset; + dataset.reserve(dataset_size); + for (std::size_t i = 0; i < dataset_size; ++i) dataset.push_back(sz::scripts::random_string(i % 32, "abcd", 4)); + + // Run several iterations of fuzzy tests. + for (std::size_t experiment_idx = 0; experiment_idx < 10; ++experiment_idx) { + std::shuffle(dataset.begin(), dataset.end(), global_random_generator()); + auto order = sz::sorted_order(dataset); + for (std::size_t i = 1; i < dataset_size; ++i) { assert(dataset[order[i - 1]] <= dataset[order[i]]); } + } + } + + // Test on random strings of varying lengths with zero characters. + for (std::size_t dataset_size : {10u, 100u, 1000u, 10000u}) { + strs_t dataset; + dataset.reserve(dataset_size); + for (std::size_t i = 0; i < dataset_size; ++i) + dataset.push_back(sz::scripts::random_string(i % 32, "abcd\0", 5)); // Run several iterations of fuzzy tests. - for (std::size_t experiment_idx = 0; experiment_idx != 10; ++experiment_idx) { + for (std::size_t experiment_idx = 0; experiment_idx < 10; ++experiment_idx) { std::shuffle(dataset.begin(), dataset.end(), global_random_generator()); auto order = sz::sorted_order(dataset); - for (std::size_t i = 1; i != dataset_size; ++i) { assert(dataset[order[i - 1]] <= dataset[order[i]]); } + for (std::size_t i = 1; i < dataset_size; ++i) { assert(dataset[order[i - 1]] <= dataset[order[i]]); } } } } From 0fda5a55523d66bb1bfed3c4cd81635e9df582da Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 12:23:09 +0000 Subject: [PATCH 100/751] Fix: `sz_sort_serial` passes for same length inputs --- include/stringzilla/sort.h | 30 +++++++++++++++++++++++++----- scripts/test.cpp | 26 ++++++-------------------- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index c96757e6..d72429ce 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -76,7 +76,28 @@ SZ_PUBLIC void _sz_sort_serial_export_prefixes( // #else *target_integer = sz_u32_bytes_reverse(*target_integer); #endif + _sz_assert( // + (length <= start_in_collection) == (*target_integer == 0) && // + "We can have a zero value if only the string is shorter than other strings at this position."); } + + // As our goal is to sort the strings using the exported integer "windows", + // this is a good place to validate the correctness of the exported data. 
+ if (SZ_DEBUG && start_character == 0) + for (sz_size_t i = start_in_collection + 1; i < end_in_collection; ++i) { + _sz_sorting_window_t const previous_window = global_windows[i - 1]; + _sz_sorting_window_t const current_window = global_windows[i]; + sz_cptr_t const previous_str = collection->get_start(collection, i - 1); + sz_size_t const previous_length = collection->get_length(collection, i - 1); + sz_cptr_t const current_str = collection->get_start(collection, i); + sz_size_t const current_length = collection->get_length(collection, i); + sz_ordering_t const ordering = sz_order( // + previous_str, previous_length > window_capacity ? window_capacity : previous_length, // + current_str, current_length > window_capacity ? window_capacity : current_length); + _sz_assert( // + (previous_window < current_window) == (ordering == sz_less_k) && // + "The exported windows should be in the same order as the original strings."); + } } /** @@ -143,21 +164,20 @@ SZ_PUBLIC void _sz_sort_serial_recursively( / _sz_sorting_window_t *const global_windows, sz_size_t *const global_order, // sz_size_t const start_in_collection, sz_size_t const end_in_collection, // sz_size_t const start_character) { + // Partition the collection around some pivot sz_size_t pivot_index = _sz_sort_serial_partition(global_windows, global_order, start_in_collection, end_in_collection); // Recursively sort the left partition - if (start_in_collection < pivot_index) { + if (start_in_collection < pivot_index) _sz_sort_serial_recursively(collection, global_windows, global_order, start_in_collection, pivot_index, start_character); - } // Recursively sort the right partition - if (pivot_index + 1 < end_in_collection) { + if (pivot_index + 1 < end_in_collection) _sz_sort_serial_recursively(collection, global_windows, global_order, pivot_index + 1, end_in_collection, start_character); - } } SZ_PUBLIC void _sz_sort_serial_next_window( // @@ -258,7 +278,7 @@ SZ_PUBLIC sz_bool_t sz_sort_serial(sz_sequence_t const *collection, sz_memory_al if (!windows) return sz_false_k; // Recursively sort the whole collection. - _sz_sort_serial_recursively(collection, windows, order, 0, collection->count, 0); + _sz_sort_serial_next_window(collection, windows, order, 0, collection->count, 0); // Free temporary storage. alloc->free(windows, collection->count * sizeof(_sz_sorting_window_t), alloc); diff --git a/scripts/test.cpp b/scripts/test.cpp index d4df88eb..04471495 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -1601,22 +1601,22 @@ static void test_sequence_algorithms() { assert_scoped(strs_t x({"b", "a", "d", "c"}), (void)0, sz::sorted_order(x) == order_t({1u, 0u, 3u, 2u})); // Test on long strings of identical length. - for (std::size_t dataset_size : {10u, 40u, 1000u, 10000u}) { + for (std::size_t dataset_size : {10u, 100u, 1000u, 10000u}) { strs_t dataset; constexpr std::size_t long_length = 20; dataset.reserve(dataset_size); for (std::size_t i = 0; i < dataset_size; ++i) - dataset.push_back(sz::scripts::random_string(long_length, "abcd", 4)); + dataset.push_back(sz::scripts::random_string(long_length, "ab", 2)); auto order = sz::sorted_order(dataset); for (std::size_t i = 1; i < dataset.size(); ++i) assert(dataset[order[i - 1]] <= dataset[order[i]]); } // Test on random strings of varying (but small) lengths. 
- for (std::size_t dataset_size : {10u, 40u, 1000u, 10000u}) { + for (std::size_t dataset_size : {10u, 100u, 1000u, 10000u}) { strs_t dataset; dataset.reserve(dataset_size); - for (std::size_t i = 0; i < dataset_size; ++i) dataset.push_back(sz::scripts::random_string(i % 32, "abcd", 4)); + for (std::size_t i = 0; i < dataset_size; ++i) dataset.push_back(sz::scripts::random_string(i % 32, "ab", 2)); // Run several iterations of fuzzy tests. for (std::size_t experiment_idx = 0; experiment_idx < 10; ++experiment_idx) { @@ -1630,8 +1630,7 @@ static void test_sequence_algorithms() { for (std::size_t dataset_size : {10u, 100u, 1000u, 10000u}) { strs_t dataset; dataset.reserve(dataset_size); - for (std::size_t i = 0; i < dataset_size; ++i) - dataset.push_back(sz::scripts::random_string(i % 32, "abcd\0", 5)); + for (std::size_t i = 0; i < dataset_size; ++i) dataset.push_back(sz::scripts::random_string(i % 32, "ab\0", 3)); // Run several iterations of fuzzy tests. for (std::size_t experiment_idx = 0; experiment_idx < 10; ++experiment_idx) { @@ -1658,20 +1657,7 @@ static void test_stl_containers() { } int main(int argc, char const **argv) { - - auto dist = _sz_edit_distance_skewed_diagonals_upto63_ice("kiten", 5, "katerinas", 9, SZ_SIZE_MAX); - _sz_assert(dist == 5); - dist = _sz_edit_distance_skewed_diagonals_upto63_ice("kiten", 5, "katerinas", 9, 3); - _sz_assert(dist == SZ_SIZE_MAX); - dist = _sz_edit_distance_skewed_diagonals_upto63_ice("kiten", 5, "katerinas", 9, 4); - _sz_assert(dist == SZ_SIZE_MAX); - dist = _sz_edit_distance_skewed_diagonals_upto63_ice("kiten", 5, "katerinas", 9, 5); - _sz_assert(dist == 5); - dist = _sz_edit_distance_skewed_diagonals_upto63_ice("kiten", 5, "katerinas", 9, 6); - _sz_assert(dist == 5); - - // Similarity measures and fuzzy search - test_levenshtein_distances(); + test_sequence_algorithms(); // Let's greet the user nicely sz_unused(argc && argv); From bdee11181a0e86263b91da0705b6a4bb45489eab Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 20:41:43 +0000 Subject: [PATCH 101/751] Fix: `uniform_int_distribution` lower bound --- scripts/test.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/test.hpp b/scripts/test.hpp index 6d85dec2..6c37e9f6 100644 --- a/scripts/test.hpp +++ b/scripts/test.hpp @@ -52,7 +52,7 @@ struct uniform_uint8_distribution_t { }; inline void randomize_string(char *string, std::size_t length, char const *alphabet, std::size_t cardinality) { - uniform_uint8_distribution_t distribution(cardinality - 1); + uniform_uint8_distribution_t distribution(0, cardinality - 1); std::generate(string, string + length, [&]() -> char { return alphabet[distribution(global_random_generator())]; }); } From 8bad799b72ba38728ad0829e1fba25dff9cd6746 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 15 Feb 2025 22:04:29 +0000 Subject: [PATCH 102/751] Fix: `sz_sort_serial` passes tests Benchmarks on Sapphire Rapids suggest: - For 8.3 M words in Leipzig1M.txt of length ~5 -- `std::sort` is 2 seconds -- `sz_sort_serial` is 0.6 seconds -- `qsort_r` is 3.2 seconds - For 268 M words in XLSum.csv of length ~8 -- `std::sort` is 147 seconds -- `sz_sort_serial` is 29 seconds -- `qsort_r` is 192 seconds --- include/stringzilla/sort.h | 120 +++++++++++++++++++++++-------------- scripts/bench_sort.cpp | 4 +- scripts/test.cpp | 36 ++++++++--- 3 files changed, 104 insertions(+), 56 deletions(-) diff --git a/include/stringzilla/sort.h 
b/include/stringzilla/sort.h index d72429ce..2e35282f 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -48,10 +48,10 @@ SZ_PUBLIC sz_bool_t sz_sort_sve(sz_sequence_t const *collection, sz_memory_alloc typedef sz_size_t _sz_sorting_window_t; -SZ_PUBLIC void _sz_sort_serial_export_prefixes( // - sz_sequence_t const *const collection, // - _sz_sorting_window_t *const global_windows, // - sz_size_t const start_in_collection, sz_size_t const end_in_collection, // +SZ_PUBLIC void _sz_sort_serial_export_prefixes( // + sz_sequence_t const *const collection, // + _sz_sorting_window_t *const global_windows, sz_sorted_idx_t const *const global_order, // + sz_size_t const start_in_collection, sz_size_t const end_in_collection, // sz_size_t const start_character) { // Depending on the architecture, we will export a different number of bytes. @@ -60,11 +60,18 @@ SZ_PUBLIC void _sz_sort_serial_export_prefixes( // // Perform the same operation for every string. for (sz_size_t i = start_in_collection; i < end_in_collection; ++i) { + + // On the first recursion level, the `global_order` is the identity permutation. + sz_sorted_idx_t const partial_order_index = global_order[i]; + if (SZ_DEBUG && start_character == 0) + _sz_assert(partial_order_index == i && "At start this must be an identity permutation."); + // Get the string slice in global memory. - sz_cptr_t const source_str = collection->get_start(collection, i); - sz_size_t const length = collection->get_length(collection, i); + sz_cptr_t const source_str = collection->get_start(collection, partial_order_index); + sz_size_t const length = collection->get_length(collection, partial_order_index); sz_size_t const remaining_length = length > start_character ? length - start_character : 0; sz_size_t const exported_length = remaining_length > window_capacity ? window_capacity : remaining_length; + // Fill with zeros, export a slice, and mark the exported length. sz_size_t *target_integer = &global_windows[i]; sz_ptr_t target_str = (sz_ptr_t)target_integer; @@ -76,8 +83,8 @@ SZ_PUBLIC void _sz_sort_serial_export_prefixes( // #else *target_integer = sz_u32_bytes_reverse(*target_integer); #endif - _sz_assert( // - (length <= start_in_collection) == (*target_integer == 0) && // + _sz_assert( // + (length <= start_character) == (*target_integer == 0) && // "We can have a zero value if only the string is shorter than other strings at this position."); } @@ -101,19 +108,25 @@ SZ_PUBLIC void _sz_sort_serial_export_prefixes( // } /** - * @brief Helper function of the serial QuickSort algorithm, that rearranges the elements in + * @brief The most important part of the QuickSort algorithm, that rearranges the elements in * such a way, that all entries around the pivot are less than the pivot. * * It means that no relative order among the elements on the left or right side of the pivot is preserved. * We chose the pivot point using Robert Sedgewick's method - the median of three elements - the first, * the middle, and the last element of the given range. + * + * Moreover, considering our iterative refinement procedure, we can't just use the normal 2-way partitioning, + * as it will scatter the values equal to the pivot into the left and right partitions. Instead we use the + * Dutch National Flag @b 3-way partitioning, outputting the range of values equal to the pivot. 
+ * + * @see https://en.wikipedia.org/wiki/Dutch_national_flag_problem */ -SZ_PUBLIC sz_size_t _sz_sort_serial_partition( // +SZ_PUBLIC void _sz_sort_serial_3way_partition( // _sz_sorting_window_t *const global_windows, sz_sorted_idx_t *const global_order, // - sz_size_t const start_in_collection, sz_size_t const end_in_collection) { + sz_size_t const start_in_collection, sz_size_t const end_in_collection, // + sz_size_t *first_pivot_offset, sz_size_t *last_pivot_offset) { - // Chose the pivot offset. - sz_size_t pivot_offset; + // Chose the pivot offset with Sedgewick's method. _sz_sorting_window_t pivot_window; { sz_size_t const middle_offset = start_in_collection + (end_in_collection - start_in_collection) / 2; @@ -123,40 +136,52 @@ SZ_PUBLIC sz_size_t _sz_sort_serial_partition( _sz_sorting_window_t const middle_window = global_windows[middle_offset]; _sz_sorting_window_t const last_window = global_windows[last_offset]; if (first_window < middle_window) { - if (middle_window < last_window) { pivot_offset = middle_offset, pivot_window = middle_window; } - else if (first_window < last_window) { pivot_offset = last_offset, pivot_window = last_window; } - else { pivot_offset = first_offset, pivot_window = first_window; } + if (middle_window < last_window) { pivot_window = middle_window; } + else if (first_window < last_window) { pivot_window = last_window; } + else { pivot_window = first_window; } } else { - if (first_window < last_window) { pivot_offset = first_offset, pivot_window = first_window; } - else if (middle_window < last_window) { pivot_offset = last_offset, pivot_window = last_window; } - else { pivot_offset = middle_offset, pivot_window = middle_window; } + if (first_window < last_window) { pivot_window = first_window; } + else if (middle_window < last_window) { pivot_window = last_window; } + else { pivot_window = middle_window; } } } - // Loop through the collection and move the elements around the pivot. - sz_size_t left_offset = start_in_collection; - sz_size_t right_offset = end_in_collection - 1; - while (left_offset < right_offset) { - // Find the first element on the left that is greater than the pivot. - while (global_windows[left_offset] < pivot_window) ++left_offset; - // Find the first element on the right that is less than the pivot. - while (global_windows[right_offset] > pivot_window) --right_offset; - // Swap the elements if they are in the wrong order. - if (left_offset <= right_offset) { + // Loop through the collection and move the elements around the pivot with the 3-way partitioning. + sz_size_t partitioning_progress = start_in_collection; // Current index. + sz_size_t less_than_pivot_offset = start_in_collection; // Boundary for elements < pivot_window. + sz_size_t greater_than_pivot_offset = end_in_collection - 1; // Boundary for elements > pivot_window. + + while (partitioning_progress <= greater_than_pivot_offset) { + // Element is less than pivot: swap into the < pivot region. 
+ if (global_windows[partitioning_progress] < pivot_window) { +#if defined(_SZ_IS_64_BIT) + sz_u64_swap(&global_order[partitioning_progress], &global_order[less_than_pivot_offset]); + sz_u64_swap(&global_windows[partitioning_progress], &global_windows[less_than_pivot_offset]); +#else + sz_u32_swap(&global_order[partitioning_progress], &global_order[less_than_pivot_offset]); + sz_u32_swap(&global_windows[partitioning_progress], &global_windows[less_than_pivot_offset]); +#endif + ++partitioning_progress; + ++less_than_pivot_offset; + } + // Element is greater than pivot: swap into the > pivot region. + else if (global_windows[partitioning_progress] > pivot_window) { #if defined(_SZ_IS_64_BIT) - sz_u64_swap(&global_order[left_offset], &global_order[right_offset]); - sz_u64_swap(&global_windows[left_offset], &global_windows[right_offset]); + sz_u64_swap(&global_order[partitioning_progress], &global_order[greater_than_pivot_offset]); + sz_u64_swap(&global_windows[partitioning_progress], &global_windows[greater_than_pivot_offset]); #else - sz_u32_swap(&global_order[left_offset], &global_order[right_offset]); - sz_u32_swap(&global_windows[left_offset], &global_windows[right_offset]); + sz_u32_swap(&global_order[partitioning_progress], &global_order[greater_than_pivot_offset]); + sz_u32_swap(&global_windows[partitioning_progress], &global_windows[greater_than_pivot_offset]); #endif - ++left_offset; - --right_offset; + --greater_than_pivot_offset; } + // Element equals pivot_window: leave it in place. + else { ++partitioning_progress; } } - return pivot_offset; + *first_pivot_offset = less_than_pivot_offset; + *last_pivot_offset = greater_than_pivot_offset; } SZ_PUBLIC void _sz_sort_serial_recursively( // @@ -165,18 +190,21 @@ SZ_PUBLIC void _sz_sort_serial_recursively( / sz_size_t const start_in_collection, sz_size_t const end_in_collection, // sz_size_t const start_character) { - // Partition the collection around some pivot - sz_size_t pivot_index = - _sz_sort_serial_partition(global_windows, global_order, start_in_collection, end_in_collection); + // Partition the collection around some pivot or 2 pivots in a 3-way partitioning + sz_size_t first_pivot_index, last_pivot_index; + _sz_sort_serial_3way_partition( // + global_windows, global_order, // + start_in_collection, end_in_collection, // + &first_pivot_index, &last_pivot_index); // Recursively sort the left partition - if (start_in_collection < pivot_index) - _sz_sort_serial_recursively(collection, global_windows, global_order, start_in_collection, pivot_index, + if (start_in_collection < first_pivot_index) + _sz_sort_serial_recursively(collection, global_windows, global_order, start_in_collection, first_pivot_index, start_character); // Recursively sort the right partition - if (pivot_index + 1 < end_in_collection) - _sz_sort_serial_recursively(collection, global_windows, global_order, pivot_index + 1, end_in_collection, + if (last_pivot_index + 1 < end_in_collection) + _sz_sort_serial_recursively(collection, global_windows, global_order, last_pivot_index + 1, end_in_collection, start_character); } @@ -187,7 +215,7 @@ SZ_PUBLIC void _sz_sort_serial_next_window( / sz_size_t const start_character) { // Prepare the new range of windows - _sz_sort_serial_export_prefixes(collection, global_windows, start_in_collection, end_in_collection, + _sz_sort_serial_export_prefixes(collection, global_windows, global_order, start_in_collection, end_in_collection, start_character); // Sort current windows with a quicksort @@ -208,7 +236,7 @@ SZ_PUBLIC void 
_sz_sort_serial_next_window( / // If the identical windows are not trivial and each string has more characters, sort them recursively sz_cptr_t current_window_str = (sz_cptr_t)¤t_window_integer; - sz_size_t current_window_length = (sz_size_t)current_window_str[window_capacity]; + sz_size_t current_window_length = (sz_size_t)current_window_str[0]; //! The byte order was swapped if (nested_end - nested_start > 1 && current_window_length == window_capacity) { _sz_sort_serial_next_window(collection, global_windows, global_order, nested_start, nested_end, start_character + window_capacity); @@ -296,7 +324,7 @@ SZ_PUBLIC void _sz_sort_ice_recursively( / sz_size_t const start_character) { // Prepare the new range of windows - _sz_sort_serial_export_prefixes(collection, global_windows, start_in_collection, end_in_collection, + _sz_sort_serial_export_prefixes(collection, global_windows, global_order, start_in_collection, end_in_collection, start_character); // We can implement a form of a Radix sort here, that will count the number of elements with diff --git a/scripts/bench_sort.cpp b/scripts/bench_sort.cpp index ac81e233..75800582 100644 --- a/scripts/bench_sort.cpp +++ b/scripts/bench_sort.cpp @@ -95,14 +95,14 @@ int main(int argc, char const **argv) { }); expect_sorted(strings, permute_base); - bench_permute("sz_sort", strings, permute_new, [](strings_t const &strings, permute_t &permute) { + bench_permute("sz_sort_serial", strings, permute_new, [](strings_t const &strings, permute_t &permute) { sz_sequence_t array; array.count = strings.size(); array.handle = &strings; array.get_start = get_start; array.get_length = get_length; sz::_with_alloc>( - [&](sz_memory_allocator_t &alloc) { return sz_sort(&array, &alloc, permute.data()); }); + [&](sz_memory_allocator_t &alloc) { return sz_sort_serial(&array, &alloc, permute.data()); }); }); expect_sorted(strings, permute_new); diff --git a/scripts/test.cpp b/scripts/test.cpp index 04471495..d8f0cdd6 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -1601,22 +1601,43 @@ static void test_sequence_algorithms() { assert_scoped(strs_t x({"b", "a", "d", "c"}), (void)0, sz::sorted_order(x) == order_t({1u, 0u, 3u, 2u})); // Test on long strings of identical length. + for (std::size_t string_length : {5u, 25u}) { + for (std::size_t dataset_size : {10u, 100u, 1000u, 10000u}) { + strs_t dataset; + dataset.reserve(dataset_size); + for (std::size_t i = 0; i < dataset_size; ++i) + dataset.push_back(sz::scripts::random_string(string_length, "ab", 2)); + + // Run several iterations of fuzzy tests. + for (std::size_t experiment_idx = 0; experiment_idx < 10; ++experiment_idx) { + std::shuffle(dataset.begin(), dataset.end(), global_random_generator()); + auto order = sz::sorted_order(dataset); + for (std::size_t i = 1; i < dataset.size(); ++i) assert(dataset[order[i - 1]] <= dataset[order[i]]); + } + } + } + + // Test on random very small strings of varying lengths, likely with many equal inputs. for (std::size_t dataset_size : {10u, 100u, 1000u, 10000u}) { strs_t dataset; - constexpr std::size_t long_length = 20; dataset.reserve(dataset_size); - for (std::size_t i = 0; i < dataset_size; ++i) - dataset.push_back(sz::scripts::random_string(long_length, "ab", 2)); + for (std::size_t i = 0; i < dataset_size; ++i) dataset.push_back(sz::scripts::random_string(i % 6, "ab", 2)); - auto order = sz::sorted_order(dataset); - for (std::size_t i = 1; i < dataset.size(); ++i) assert(dataset[order[i - 1]] <= dataset[order[i]]); + // Run several iterations of fuzzy tests. 
+ for (std::size_t experiment_idx = 0; experiment_idx < 10; ++experiment_idx) { + std::shuffle(dataset.begin(), dataset.end(), global_random_generator()); + auto order = sz::sorted_order(dataset); + for (std::size_t i = 1; i < dataset_size; ++i) { assert(dataset[order[i - 1]] <= dataset[order[i]]); } + } } - // Test on random strings of varying (but small) lengths. + // Test on random strings of varying lengths. for (std::size_t dataset_size : {10u, 100u, 1000u, 10000u}) { strs_t dataset; dataset.reserve(dataset_size); - for (std::size_t i = 0; i < dataset_size; ++i) dataset.push_back(sz::scripts::random_string(i % 32, "ab", 2)); + constexpr std::size_t min_length = 6; + for (std::size_t i = 0; i < dataset_size; ++i) + dataset.push_back(sz::scripts::random_string(min_length + i % 32, "ab", 2)); // Run several iterations of fuzzy tests. for (std::size_t experiment_idx = 0; experiment_idx < 10; ++experiment_idx) { @@ -1657,7 +1678,6 @@ static void test_stl_containers() { } int main(int argc, char const **argv) { - test_sequence_algorithms(); // Let's greet the user nicely sz_unused(argc && argv); From 6191cc6173b4867223e086599342d876c5bf09ec Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 16 Feb 2025 10:55:42 +0000 Subject: [PATCH 103/751] Improve: Rename `sz_sort` to `sz_qsort` Makes it easier to differentiate stable `sz_msort` --- include/stringzilla/sort.h | 140 ++++++++++++++++++++----------------- 1 file changed, 75 insertions(+), 65 deletions(-) diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index 2e35282f..9af46a20 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -5,8 +5,13 @@ * * Includes core APIs: * - * - `sz_sort` - to sort an arbitrary string collection. - * - TODO: `sz_stable_sort` - to sort a string collection while preserving the relative order of equal elements. + * - `sz_qsort` - to sort an arbitrary string collection with QuickSort-like algorithm. + * - `sz_qsort_addresses` - to sort a collection of continuous pointer-sized integers with QuickSort-like algorithm. + * - `sz_msort` - to sort an arbitrary string collection with a MergeSort-like algorithm. + * - `sz_msort_addresses` - to sort a collection of continuous pointer-sized integers with a MergeSort-like algorithm. + * + * The `qsort` variants are not guaranteed to be stable. + * The `msort` variants are guaranteed to be stable. */ #ifndef STRINGZILLA_SORT_H_ #define STRINGZILLA_SORT_H_ @@ -29,34 +34,39 @@ extern "C" { * @param order The output - indices of the sorted collection elements. * @return Whether the operation was successful. 
*/ -SZ_DYNAMIC sz_bool_t sz_sort(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); - -/** @copydoc sz_sort */ -SZ_PUBLIC sz_bool_t sz_sort_serial(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_DYNAMIC sz_bool_t sz_qsort(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); -/** @copydoc sz_sort */ -SZ_PUBLIC sz_bool_t sz_sort_skylake(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, +/** @copydoc sz_qsort */ +SZ_PUBLIC sz_bool_t sz_qsort_serial(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); -/** @copydoc sz_sort */ -SZ_PUBLIC sz_bool_t sz_sort_sve(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); +/** @copydoc sz_qsort */ +SZ_PUBLIC sz_bool_t sz_qsort_skylake(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); + +/** @copydoc sz_qsort */ +SZ_PUBLIC sz_bool_t sz_qsort_sve(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); #pragma endregion #pragma region Serial Implementation -typedef sz_size_t _sz_sorting_window_t; +/** + * The core idea of all following string algorithms is to sort strings not based on 1 character at a time, + * but on a larger integer word fitting in 4 or 8 bytes at once, on 32-bit or 64-bit architectures, respectively. + * That word is pointer-sized, but it may contain extra information aside from N characters. + */ +typedef sz_size_t _sz_sort_ngram_t; -SZ_PUBLIC void _sz_sort_serial_export_prefixes( // - sz_sequence_t const *const collection, // - _sz_sorting_window_t *const global_windows, sz_sorted_idx_t const *const global_order, // - sz_size_t const start_in_collection, sz_size_t const end_in_collection, // +SZ_PUBLIC void _sz_qsort_serial_export_prefixes( // + sz_sequence_t const *const collection, // + _sz_sort_ngram_t *const global_windows, sz_sorted_idx_t const *const global_order, // + sz_size_t const start_in_collection, sz_size_t const end_in_collection, // sz_size_t const start_character) { // Depending on the architecture, we will export a different number of bytes. // On 32-bit architectures, we will export 3 bytes, and on 64-bit architectures - 7 bytes. - sz_size_t const window_capacity = sizeof(_sz_sorting_window_t) - 1; + sz_size_t const window_capacity = sizeof(_sz_sort_ngram_t) - 1; // Perform the same operation for every string. for (sz_size_t i = start_in_collection; i < end_in_collection; ++i) { @@ -92,8 +102,8 @@ SZ_PUBLIC void _sz_sort_serial_export_prefixes( // this is a good place to validate the correctness of the exported data. 
if (SZ_DEBUG && start_character == 0) for (sz_size_t i = start_in_collection + 1; i < end_in_collection; ++i) { - _sz_sorting_window_t const previous_window = global_windows[i - 1]; - _sz_sorting_window_t const current_window = global_windows[i]; + _sz_sort_ngram_t const previous_window = global_windows[i - 1]; + _sz_sort_ngram_t const current_window = global_windows[i]; sz_cptr_t const previous_str = collection->get_start(collection, i - 1); sz_size_t const previous_length = collection->get_length(collection, i - 1); sz_cptr_t const current_str = collection->get_start(collection, i); @@ -121,20 +131,20 @@ SZ_PUBLIC void _sz_sort_serial_export_prefixes( * * @see https://en.wikipedia.org/wiki/Dutch_national_flag_problem */ -SZ_PUBLIC void _sz_sort_serial_3way_partition( // - _sz_sorting_window_t *const global_windows, sz_sorted_idx_t *const global_order, // - sz_size_t const start_in_collection, sz_size_t const end_in_collection, // +SZ_PUBLIC void _sz_qsort_serial_3way_partition( // + _sz_sort_ngram_t *const global_windows, sz_sorted_idx_t *const global_order, // + sz_size_t const start_in_collection, sz_size_t const end_in_collection, // sz_size_t *first_pivot_offset, sz_size_t *last_pivot_offset) { // Chose the pivot offset with Sedgewick's method. - _sz_sorting_window_t pivot_window; + _sz_sort_ngram_t pivot_window; { sz_size_t const middle_offset = start_in_collection + (end_in_collection - start_in_collection) / 2; sz_size_t const last_offset = end_in_collection - 1; sz_size_t const first_offset = start_in_collection; - _sz_sorting_window_t const first_window = global_windows[first_offset]; - _sz_sorting_window_t const middle_window = global_windows[middle_offset]; - _sz_sorting_window_t const last_window = global_windows[last_offset]; + _sz_sort_ngram_t const first_window = global_windows[first_offset]; + _sz_sort_ngram_t const middle_window = global_windows[middle_offset]; + _sz_sort_ngram_t const last_window = global_windows[last_offset]; if (first_window < middle_window) { if (middle_window < last_window) { pivot_window = middle_window; } else if (first_window < last_window) { pivot_window = last_window; } @@ -184,70 +194,70 @@ SZ_PUBLIC void _sz_sort_serial_3way_partition( *last_pivot_offset = greater_than_pivot_offset; } -SZ_PUBLIC void _sz_sort_serial_recursively( // - sz_sequence_t const *const collection, // - _sz_sorting_window_t *const global_windows, sz_size_t *const global_order, // - sz_size_t const start_in_collection, sz_size_t const end_in_collection, // +SZ_PUBLIC void _sz_qsort_serial_recursively( // + sz_sequence_t const *const collection, // + _sz_sort_ngram_t *const global_windows, sz_size_t *const global_order, // + sz_size_t const start_in_collection, sz_size_t const end_in_collection, // sz_size_t const start_character) { // Partition the collection around some pivot or 2 pivots in a 3-way partitioning sz_size_t first_pivot_index, last_pivot_index; - _sz_sort_serial_3way_partition( // + _sz_qsort_serial_3way_partition( // global_windows, global_order, // start_in_collection, end_in_collection, // &first_pivot_index, &last_pivot_index); // Recursively sort the left partition if (start_in_collection < first_pivot_index) - _sz_sort_serial_recursively(collection, global_windows, global_order, start_in_collection, first_pivot_index, - start_character); + _sz_qsort_serial_recursively(collection, global_windows, global_order, start_in_collection, first_pivot_index, + start_character); // Recursively sort the right partition if (last_pivot_index + 1 < 
end_in_collection) - _sz_sort_serial_recursively(collection, global_windows, global_order, last_pivot_index + 1, end_in_collection, - start_character); + _sz_qsort_serial_recursively(collection, global_windows, global_order, last_pivot_index + 1, end_in_collection, + start_character); } -SZ_PUBLIC void _sz_sort_serial_next_window( // - sz_sequence_t const *const collection, // - _sz_sorting_window_t *const global_windows, sz_size_t *const global_order, // - sz_size_t const start_in_collection, sz_size_t const end_in_collection, // +SZ_PUBLIC void _sz_qsort_serial_next_window( // + sz_sequence_t const *const collection, // + _sz_sort_ngram_t *const global_windows, sz_size_t *const global_order, // + sz_size_t const start_in_collection, sz_size_t const end_in_collection, // sz_size_t const start_character) { // Prepare the new range of windows - _sz_sort_serial_export_prefixes(collection, global_windows, global_order, start_in_collection, end_in_collection, - start_character); + _sz_qsort_serial_export_prefixes(collection, global_windows, global_order, start_in_collection, end_in_collection, + start_character); // Sort current windows with a quicksort - _sz_sort_serial_recursively(collection, global_windows, global_order, start_in_collection, end_in_collection, - start_character); + _sz_qsort_serial_recursively(collection, global_windows, global_order, start_in_collection, end_in_collection, + start_character); // Depending on the architecture, we will export a different number of bytes. // On 32-bit architectures, we will export 3 bytes, and on 64-bit architectures - 7 bytes. - sz_size_t const window_capacity = sizeof(_sz_sorting_window_t) - 1; + sz_size_t const window_capacity = sizeof(_sz_sort_ngram_t) - 1; // Repeat the procedure for the identical windows sz_size_t nested_start = start_in_collection; sz_size_t nested_end = start_in_collection; while (nested_end != end_in_collection) { // Find the end of the identical windows - _sz_sorting_window_t current_window_integer = global_windows[nested_start]; + _sz_sort_ngram_t current_window_integer = global_windows[nested_start]; while (nested_end != end_in_collection && current_window_integer == global_windows[nested_end]) ++nested_end; // If the identical windows are not trivial and each string has more characters, sort them recursively sz_cptr_t current_window_str = (sz_cptr_t)¤t_window_integer; sz_size_t current_window_length = (sz_size_t)current_window_str[0]; //! 
The byte order was swapped if (nested_end - nested_start > 1 && current_window_length == window_capacity) { - _sz_sort_serial_next_window(collection, global_windows, global_order, nested_start, nested_end, - start_character + window_capacity); + _sz_qsort_serial_next_window(collection, global_windows, global_order, nested_start, nested_end, + start_character + window_capacity); } // Move to the next nested_start = nested_end; } } -SZ_PUBLIC void _sz_sort_serial_insertion(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { +SZ_PUBLIC void _sz_qsort_serial_insertion(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { // This algorithm needs no memory allocations: sz_unused(alloc); @@ -277,8 +287,8 @@ SZ_PUBLIC void _sz_sort_serial_insertion(sz_sequence_t const *collection, sz_mem } } -SZ_PUBLIC sz_bool_t sz_sort_serial(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { +SZ_PUBLIC sz_bool_t sz_qsort_serial(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { // First, initialize the `order` with `std::iota`-like behavior. for (sz_size_t i = 0; i != collection->count; ++i) order[i] = i; @@ -286,7 +296,7 @@ SZ_PUBLIC sz_bool_t sz_sort_serial(sz_sequence_t const *collection, sz_memory_al // On very small collections - just use the quadratic-complexity insertion sort // without any smart optimizations or memory allocations. if (collection->count <= 32) { - _sz_sort_serial_insertion(collection, alloc, order); + _sz_qsort_serial_insertion(collection, alloc, order); return sz_true_k; } @@ -301,15 +311,15 @@ SZ_PUBLIC sz_bool_t sz_sort_serial(sz_sequence_t const *collection, sz_memory_al // Assuming that some strings may contain or even end with NULL bytes, we need to make sure, that their length // is included in those P-long words. So, in reality, we will be taking (P-1) bytes from each string on every // iteration of a recursive algorithm. - _sz_sorting_window_t *windows = - (_sz_sorting_window_t *)alloc->allocate(collection->count * sizeof(_sz_sorting_window_t), alloc); + _sz_sort_ngram_t *windows = + (_sz_sort_ngram_t *)alloc->allocate(collection->count * sizeof(_sz_sort_ngram_t), alloc); if (!windows) return sz_false_k; // Recursively sort the whole collection. - _sz_sort_serial_next_window(collection, windows, order, 0, collection->count, 0); + _sz_qsort_serial_next_window(collection, windows, order, 0, collection->count, 0); // Free temporary storage. 
- alloc->free(windows, collection->count * sizeof(_sz_sorting_window_t), alloc); + alloc->free(windows, collection->count * sizeof(_sz_sort_ngram_t), alloc); return sz_true_k; } @@ -317,22 +327,22 @@ SZ_PUBLIC sz_bool_t sz_sort_serial(sz_sequence_t const *collection, sz_memory_al #pragma region Ice Lake Implementation -SZ_PUBLIC void _sz_sort_ice_recursively( // - sz_sequence_t const *const collection, // - _sz_sorting_window_t *const global_windows, sz_size_t *const global_order, // - sz_size_t const start_in_collection, sz_size_t const end_in_collection, // +SZ_PUBLIC void _sz_qsort_ice_recursively( // + sz_sequence_t const *const collection, // + _sz_sort_ngram_t *const global_windows, sz_size_t *const global_order, // + sz_size_t const start_in_collection, sz_size_t const end_in_collection, // sz_size_t const start_character) { // Prepare the new range of windows - _sz_sort_serial_export_prefixes(collection, global_windows, global_order, start_in_collection, end_in_collection, - start_character); + _sz_qsort_serial_export_prefixes(collection, global_windows, global_order, start_in_collection, end_in_collection, + start_character); // We can implement a form of a Radix sort here, that will count the number of elements with // a certain bit set. The naive approach may require too many loops over data. A more "vectorized" // approach would be to maintain a histogram for several bits at once. For 4 bits we will // need 2^4 = 16 counters. sz_size_t histogram[16] = {0}; - for (sz_size_t byte_in_window = 0; byte_in_window != sizeof(_sz_sorting_window_t); ++byte_in_window) { + for (sz_size_t byte_in_window = 0; byte_in_window != sizeof(_sz_sort_ngram_t); ++byte_in_window) { // First sort based on the low nibble of each byte. for (sz_size_t i = start_in_collection; i < end_in_collection; ++i) { sz_size_t const byte = (global_windows[i] >> (byte_in_window * 8)) & 0xFF; @@ -360,8 +370,8 @@ SZ_PUBLIC void _sz_sort_ice_recursively( / #pragma region Compile Time Dispatching #if !SZ_DYNAMIC_DISPATCH -SZ_DYNAMIC sz_bool_t sz_sort(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order) { - return sz_sort_serial(collection, alloc, order); +SZ_DYNAMIC sz_bool_t sz_qsort(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order) { + return sz_qsort_serial(collection, alloc, order); } #endif // !SZ_DYNAMIC_DISPATCH From dcf6c653931b19df6379ebc5476d1e302076c259 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 16 Feb 2025 22:59:16 +0000 Subject: [PATCH 104/751] Improve: Introduce typed `_sz_swap` macro --- include/stringzilla/similarity.h | 16 ++++++++-------- include/stringzilla/types.h | 10 ++++++++++ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/include/stringzilla/similarity.h b/include/stringzilla/similarity.h index 5c521a40..188169ff 100644 --- a/include/stringzilla/similarity.h +++ b/include/stringzilla/similarity.h @@ -437,8 +437,8 @@ SZ_PUBLIC sz_size_t sz_edit_distance_serial( // // Let's make sure that we use the amount proportional to the // number of elements in the shorter string, not the larger. if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); + _sz_swap(sz_size_t, longer_length, shorter_length); + _sz_swap(sz_cptr_t, longer, shorter); } // Skip the matching prefixes and suffixes, they won't affect the distance. 
@@ -478,8 +478,8 @@ SZ_PUBLIC sz_ssize_t sz_alignment_score_serial( // // Let's make sure that we use the amount proportional to the // number of elements in the shorter string, not the larger. if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); + _sz_swap(sz_size_t, longer_length, shorter_length); + _sz_swap(sz_cptr_t, longer, shorter); } // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. @@ -513,7 +513,7 @@ SZ_PUBLIC sz_ssize_t sz_alignment_score_serial( // } // Swap previous_distances and current_distances pointers - sz_pointer_swap((void **)&previous_distances, (void **)¤t_distances); + _sz_swap(sz_ssize_t *, previous_distances, current_distances); } // Cache scalar before `free` call. @@ -1101,8 +1101,8 @@ SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_ice( // // Let's make sure that we use the amount proportional to the // number of elements in the shorter string, not the larger. if (shorter_length > longer_length) { - sz_pointer_swap((void **)&longer_length, (void **)&shorter_length); - sz_pointer_swap((void **)&longer, (void **)&shorter); + _sz_swap(sz_size_t, longer_length, shorter_length); + _sz_swap(sz_cptr_t, longer, shorter); } // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. @@ -1291,7 +1291,7 @@ SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_ice( // } // Swap previous_distances and current_distances pointers - sz_pointer_swap((void **)&previous_distances, (void **)¤t_distances); + _sz_swap(sz_i32_t *, previous_distances, current_distances); } // Cache scalar before `free` call. diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index d241b69f..6d8086f2 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -863,6 +863,16 @@ SZ_INTERNAL sz_u64_t sz_u64_blend(sz_u64_t a, sz_u64_t b, sz_u64_t mask) { retur */ #define _sz_order_scalars(a, b) ((sz_ordering_t)((a > b) - (a < b))) +/** + * Convenience macro to swap two values of the same type. + */ +#define _sz_swap(type, a, b) \ + do { \ + type _tmp = (a); \ + (a) = (b); \ + (b) = _tmp; \ + } while (0) + /** @brief Branchless minimum function for two signed 32-bit integers. */ SZ_INTERNAL sz_i32_t sz_i32_min_of_two(sz_i32_t x, sz_i32_t y) { return y + ((x - y) & (x - y) >> 31); } From 0c38bff7399f28494fa39593488f77105ab3646a Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 16 Feb 2025 23:01:56 +0000 Subject: [PATCH 105/751] Break: Pointer-sized N-gram Sorting This huge commit brings many new sorting APIs, as well as a new naming convention to differentiate inplace sorting helpers from "argsort" operations. Also refactors the testing and micro-benchmarking helpers. --- README.md | 12 +- c/lib.c | 8 +- include/stringzilla/sort.h | 582 +++++++++++++++++++++------- include/stringzilla/stringzilla.h | 2 +- include/stringzilla/stringzilla.hpp | 29 +- include/stringzilla/types.h | 25 +- python/lib.c | 2 +- scripts/bench.hpp | 98 ++--- scripts/bench_sort.cpp | 118 ++++-- scripts/test.cpp | 14 +- 10 files changed, 602 insertions(+), 288 deletions(-) diff --git a/README.md b/README.md index 52f80d41..c5253c4c 100644 --- a/README.md +++ b/README.md @@ -229,7 +229,7 @@ __Who is this for?__ arm: 13.00 s - sz_sort
+ sz_sequence_argsort
x86: 1.91 · arm: 2.37 s @@ -429,7 +429,7 @@ lines: Strs = text.split(separator='\n') # 4 bytes per line overhead for under 4 batch: Strs = lines.sample(seed=42) # 10x faster than `random.choices` lines.shuffle(seed=42) # or shuffle all lines in place and shard with slices # WIP: lines.sort() # explodes to 16 bytes per line overhead for any length text -# WIP: sorted_order: tuple = lines.argsort() # similar to `numpy.argsort` +# WIP: argsort: tuple = lines.argsort() # similar to `numpy.argsort` ``` Working on [RedPajama][redpajama], addressing 20 Billion annotated english documents, one will need only 160 GB of RAM instead of Terabytes. @@ -633,7 +633,7 @@ sz_u64_t hash = sz_hash(haystack.start, haystack.length); // Perform collection level operations sz_sequence_t array = {your_handle, your_count, your_get_start, your_get_length}; -sz_sort(&array, &your_config); +sz_sequence_argsort(&array, &your_config); ```
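For readers adapting their own containers to the C interface shown in the snippet above, here is a minimal sketch of the two callbacks a `sz_sequence_t` needs before calling `sz_sequence_argsort`. The adapter functions, the sample data, and the use of `sz_memory_allocator_init_default` for a default heap-backed allocator are illustrative assumptions, not part of the patch itself.

```c
#include <string.h> // `strlen`
#include <stringzilla/stringzilla.h>

// Hypothetical adapter callbacks exposing a plain array of C strings.
static sz_cptr_t demo_get_start(sz_sequence_t const *sequence, sz_size_t i) {
    return ((char const *const *)sequence->handle)[i];
}
static sz_size_t demo_get_length(sz_sequence_t const *sequence, sz_size_t i) {
    return strlen(((char const *const *)sequence->handle)[i]);
}

int main(void) {
    char const *words[] = {"banana", "apple", "cherry"};
    sz_sorted_idx_t order[3];

    sz_sequence_t sequence;
    sequence.handle = (void *)words;
    sequence.count = 3;
    sequence.get_start = demo_get_start;
    sequence.get_length = demo_get_length;

    sz_memory_allocator_t alloc;
    sz_memory_allocator_init_default(&alloc); // assumed default `malloc`-backed allocator
    sz_sequence_argsort(&sequence, &alloc, order); // `order` becomes {1, 0, 2}
    return 0;
}
```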
@@ -1129,14 +1129,14 @@ C++ generic algorithm is not perfect either. There is no guarantee in the standard that `std::sort` won't allocate any memory. If you are running on embedded, in real-time or on 100+ CPU cores per node, you may want to avoid that. StringZilla doesn't solve the general case, but hopes to improve the performance for strings. -Use `sz_sort`, or the high-level `sz::sorted_order`, which can be used sort any collection of elements convertible to `sz::string_view`. +Use `sz_sequence_argsort`, or the high-level `sz::argsort`, which can be used sort any collection of elements convertible to `sz::string_view`. ```cpp std::vector data({"c", "b", "a"}); -std::vector order = sz::sorted_order(data); //< Simple shortcut +std::vector order = sz::argsort(data); //< Simple shortcut // Or, taking care of memory allocation: -sz::sorted_order(data.begin(), data.end(), order.data(), [](auto const &x) -> sz::string_view { return x; }); +sz::argsort(data.begin(), data.end(), order.data(), [](auto const &x) -> sz::string_view { return x; }); ``` ### Standard C++ Containers with String Keys diff --git a/c/lib.c b/c/lib.c index 361cd049..64e7b61a 100644 --- a/c/lib.c +++ b/c/lib.c @@ -189,7 +189,7 @@ typedef struct sz_implementations_t { sz_edit_distance_t edit_distance; sz_alignment_score_t alignment_score; - sz_sort_t sort; + sz_sequence_argsort_t sequence_argsort; } sz_implementations_t; @@ -225,7 +225,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->edit_distance = sz_edit_distance_serial; impl->alignment_score = sz_alignment_score_serial; - impl->sort = sz_sort_serial; + impl->sequence_argsort = sz_sequence_argsort_serial; #if SZ_USE_HASWELL if (caps & sz_cap_haswell_k) { @@ -416,8 +416,8 @@ SZ_DYNAMIC sz_ssize_t sz_alignment_score( // return sz_dispatch_table.alignment_score(a, a_length, b, b_length, subs, gap, alloc); } -SZ_DYNAMIC sz_bool_t sz_sort(sz_sequence_t const *array, sz_memory_allocator_t *alloc, sz_size_t *order) { - return sz_dispatch_table.sort(array, alloc, order); +SZ_DYNAMIC sz_bool_t sz_sequence_argsort(sz_sequence_t const *array, sz_memory_allocator_t *alloc, sz_size_t *order) { + return sz_dispatch_table.sequence_argsort(array, alloc, order); } SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index 9af46a20..9ea19e8d 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -1,17 +1,24 @@ /** - * @brief Hardware-accelerated string collection sorting and intersections. + * @brief Hardware-accelerated string collection sorting. * @file sort.h * @author Ash Vardanian * - * Includes core APIs: + * Includes core APIs for `sz_sequence_t` string collections: * - * - `sz_qsort` - to sort an arbitrary string collection with QuickSort-like algorithm. - * - `sz_qsort_addresses` - to sort a collection of continuous pointer-sized integers with QuickSort-like algorithm. - * - `sz_msort` - to sort an arbitrary string collection with a MergeSort-like algorithm. - * - `sz_msort_addresses` - to sort a collection of continuous pointer-sized integers with a MergeSort-like algorithm. + * - `sz_sequence_argsort` - to get the sorting permutation of a string collection with QuickSort. + * - `sz_sequence_argsort_stable` - to get the stable-sorting permutation of a string collection with a MergeSort. 
+ * + * The core idea of all following string algorithms is to sort strings not based on 1 character at a time, + * but on a larger "Pointer-sized N-grams" fitting in 4 or 8 bytes at once, on 32-bit or 64-bit architectures, + * respectively. In reality we may not use the full pointer size, but only a few bytes from it, and keep the rest + * for some metadata. + * + * That, however, means, that unsigned integer sorting is a constituent part of our string sorting and we can + * expose it as an additional set of APIs for the users: + * + * - `sz_pgrams_sort` - to inplace sort continuous pointer-sized integers with QuickSort. + * - `sz_pgrams_sort_stable` - to inplace stable-sort continuous pointer-sized integers with a MergeSort. * - * The `qsort` variants are not guaranteed to be stable. - * The `msort` variants are guaranteed to be stable. */ #ifndef STRINGZILLA_SORT_H_ #define STRINGZILLA_SORT_H_ @@ -27,49 +34,131 @@ extern "C" { #pragma region Core API /** - * @brief Faster `std::sort` for an arbitrary string sequence. + * @brief Faster @b arg-sort for an arbitrary @b string sequence, using QuickSort. + * Outputs the ::order of elements in the immutable ::sequence, that would sort it. + * The algorithm doesn't guarantee stability, meaning that the relative order of equal elements + * may not be preserved. * - * @param collection The collection of strings to sort. + * @param sequence The sequence of strings to sort. * @param alloc Memory allocator for temporary storage. - * @param order The output - indices of the sorted collection elements. + * @param order The output - indices of the sorted sequence elements. * @return Whether the operation was successful. */ -SZ_DYNAMIC sz_bool_t sz_qsort(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); +SZ_DYNAMIC sz_bool_t sz_sequence_argsort(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); -/** @copydoc sz_qsort */ -SZ_PUBLIC sz_bool_t sz_qsort_serial(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, +/** + * @brief Faster @b inplace `std::sort` for a continuous @b unsigned-integer sequence, using QuickSort. + * Overwrites the input ::sequence with the sorted sequence and exports the permutation ::order. + * The algorithm doesn't guarantee stability, meaning that the relative order of equal elements + * may not be preserved. + * + * @param pgrams The continuous buffer of unsigned integers to sort in place. + * @param count The number of elements in the sequence. + * @param alloc Memory allocator for temporary storage. + * @param order The output - indices of the sorted sequence elements. + * @return Whether the operation was successful. + */ +SZ_DYNAMIC sz_bool_t sz_pgrams_sort(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); -/** @copydoc sz_qsort */ -SZ_PUBLIC sz_bool_t sz_qsort_skylake(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +/** + * @brief Faster @b arg-sort for an arbitrary @b string sequence, using MergeSort. + * Outputs the ::order of elements in the immutable ::sequence, that would sort it. + * The algorithm guarantees stability, meaning that the relative order of equal elements is preserved. + * + * This algorithm uses more memory than `sz_sequence_argsort`, but it's performance is more predictable. + * It's also preferred for very large inputs, as most memory access happens in a predictable sequential order. 
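To make the "pointer-sized N-gram" idea above concrete, here is a small standalone sketch for a 64-bit little-endian build, intended to mirror the packing performed by the serial export routine in this patch: up to 7 leading characters plus the exported length are folded into one word. The helper name is made up, and `__builtin_bswap64` stands in for the library's own byte-reversal routine.

```c
#include <stdint.h>
#include <string.h>

// Illustrative packer: a single unsigned comparison of the resulting words
// orders the string prefixes lexicographically, with the exported length
// acting as the least-significant tiebreaker among equal prefixes.
static uint64_t pack_pgram(char const *str, size_t length) {
    size_t const capacity = sizeof(uint64_t) - 1; // 7 character slots
    size_t const exported = length < capacity ? length : capacity;
    uint8_t bytes[sizeof(uint64_t)] = {0};
    memcpy(bytes, str, exported);        // characters in the low bytes
    bytes[capacity] = (uint8_t)exported; // exported length in the last byte
    uint64_t word;
    memcpy(&word, bytes, sizeof(word));
    // Byte-reverse so the first character lands in the most significant byte,
    // as the `sz_u64_bytes_reverse` step does in the serial kernel.
    return __builtin_bswap64(word);
}

// pack_pgram("apple", 5) < pack_pgram("apply", 5)  - the words differ in the 5th character
// pack_pgram("app", 3)   < pack_pgram("apple", 5)  - a proper prefix sorts first
```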
+ * + * @param sequence The sequence of strings to sort. + * @param alloc Memory allocator for temporary storage. + * @param order The output - indices of the sorted sequence elements. + * @return Whether the operation was successful. + */ +SZ_DYNAMIC sz_bool_t sz_sequence_argsort_stable(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); -/** @copydoc sz_qsort */ -SZ_PUBLIC sz_bool_t sz_qsort_sve(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); +/** + * @brief Faster @b inplace `std::stable_sort sort` for a continuous @b unsigned-integer sequence, using MergeSort. + * Overwrites the input ::sequence with the sorted sequence and exports the permutation ::order. + * The algorithm guarantees stability, meaning that the relative order of equal elements is preserved. + * + * This algorithm uses more memory than `sz_pgrams_sort`, but it's performance is more predictable. + * It's also preferred for very large inputs, as most memory access happens in a predictable sequential order. + * + * @param pgrams The continuous buffer of unsigned integers to sort in place. + * @param count The number of elements in the sequence. + * @param alloc Memory allocator for temporary storage. + * @param order The output - indices of the sorted sequence elements. + * @return Whether the operation was successful. + */ +SZ_DYNAMIC sz_bool_t sz_pgrams_sort_stable(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); -#pragma endregion +/** @copydoc sz_sequence_argsort */ +SZ_PUBLIC sz_bool_t sz_sequence_argsort_serial(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); -#pragma region Serial Implementation +/** @copydoc sz_pgrams_sort */ +SZ_PUBLIC sz_bool_t sz_pgrams_sort_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); -/** - * The core idea of all following string algorithms is to sort strings not based on 1 character at a time, - * but on a larger integer word fitting in 4 or 8 bytes at once, on 32-bit or 64-bit architectures, respectively. - * That word is pointer-sized, but it may contain extra information aside from N characters. 
- */ -typedef sz_size_t _sz_sort_ngram_t; +/** @copydoc sz_sequence_argsort */ +SZ_PUBLIC sz_bool_t sz_sequence_argsort_ice(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); -SZ_PUBLIC void _sz_qsort_serial_export_prefixes( // - sz_sequence_t const *const collection, // - _sz_sort_ngram_t *const global_windows, sz_sorted_idx_t const *const global_order, // - sz_size_t const start_in_collection, sz_size_t const end_in_collection, // +/** @copydoc sz_pgrams_sort */ +SZ_PUBLIC sz_bool_t sz_pgrams_sort_ice(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); + +/** @copydoc sz_sequence_argsort */ +SZ_PUBLIC sz_bool_t sz_sequence_argsort_sve(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); + +/** @copydoc sz_pgrams_sort */ +SZ_PUBLIC sz_bool_t sz_pgrams_sort_sve(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); + +/** @copydoc sz_sequence_argsort_stable */ +SZ_PUBLIC sz_bool_t sz_sequence_argsort_stable_serial(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); + +/** @copydoc sz_pgrams_sort_stable */ +SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); + +/** @copydoc sz_sequence_argsort_stable */ +SZ_PUBLIC sz_bool_t sz_sequence_argsort_stable_ice(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); + +/** @copydoc sz_pgrams_sort_stable */ +SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_ice(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); + +/** @copydoc sz_sequence_argsort_stable */ +SZ_PUBLIC sz_bool_t sz_sequence_argsort_stable_sve(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); + +/** @copydoc sz_pgrams_sort_stable */ +SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_sve(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); + +#pragma endregion + +#pragma region Serial QuickSort Implementation + +SZ_PUBLIC void _sz_sequence_argsort_serial_export_next_pgrams( // + sz_sequence_t const *const sequence, // + sz_pgram_t *const global_pgrams, sz_sorted_idx_t const *const global_order, // + sz_size_t const start_in_sequence, sz_size_t const end_in_sequence, // sz_size_t const start_character) { // Depending on the architecture, we will export a different number of bytes. // On 32-bit architectures, we will export 3 bytes, and on 64-bit architectures - 7 bytes. - sz_size_t const window_capacity = sizeof(_sz_sort_ngram_t) - 1; + sz_size_t const window_capacity = sizeof(sz_pgram_t) - 1; // Perform the same operation for every string. - for (sz_size_t i = start_in_collection; i < end_in_collection; ++i) { + for (sz_size_t i = start_in_sequence; i < end_in_sequence; ++i) { // On the first recursion level, the `global_order` is the identity permutation. sz_sorted_idx_t const partial_order_index = global_order[i]; @@ -77,37 +166,37 @@ SZ_PUBLIC void _sz_qsort_serial_export_prefixes( _sz_assert(partial_order_index == i && "At start this must be an identity permutation."); // Get the string slice in global memory. 
- sz_cptr_t const source_str = collection->get_start(collection, partial_order_index); - sz_size_t const length = collection->get_length(collection, partial_order_index); + sz_cptr_t const source_str = sequence->get_start(sequence, partial_order_index); + sz_size_t const length = sequence->get_length(sequence, partial_order_index); sz_size_t const remaining_length = length > start_character ? length - start_character : 0; sz_size_t const exported_length = remaining_length > window_capacity ? window_capacity : remaining_length; // Fill with zeros, export a slice, and mark the exported length. - sz_size_t *target_integer = &global_windows[i]; - sz_ptr_t target_str = (sz_ptr_t)target_integer; - *target_integer = 0; + sz_pgram_t *target_pgram = &global_pgrams[i]; + sz_ptr_t target_str = (sz_ptr_t)target_pgram; + *target_pgram = 0; for (sz_size_t j = 0; j < exported_length; ++j) target_str[j] = source_str[j + start_character]; target_str[window_capacity] = exported_length; #if defined(_SZ_IS_64_BIT) - *target_integer = sz_u64_bytes_reverse(*target_integer); + *target_pgram = sz_u64_bytes_reverse(*target_pgram); #else - *target_integer = sz_u32_bytes_reverse(*target_integer); + *target_pgram = sz_u32_bytes_reverse(*target_pgram); #endif - _sz_assert( // - (length <= start_character) == (*target_integer == 0) && // + _sz_assert( // + (length <= start_character) == (*target_pgram == 0) && // "We can have a zero value if only the string is shorter than other strings at this position."); } // As our goal is to sort the strings using the exported integer "windows", // this is a good place to validate the correctness of the exported data. if (SZ_DEBUG && start_character == 0) - for (sz_size_t i = start_in_collection + 1; i < end_in_collection; ++i) { - _sz_sort_ngram_t const previous_window = global_windows[i - 1]; - _sz_sort_ngram_t const current_window = global_windows[i]; - sz_cptr_t const previous_str = collection->get_start(collection, i - 1); - sz_size_t const previous_length = collection->get_length(collection, i - 1); - sz_cptr_t const current_str = collection->get_start(collection, i); - sz_size_t const current_length = collection->get_length(collection, i); + for (sz_size_t i = start_in_sequence + 1; i < end_in_sequence; ++i) { + sz_pgram_t const previous_window = global_pgrams[i - 1]; + sz_pgram_t const current_window = global_pgrams[i]; + sz_cptr_t const previous_str = sequence->get_start(sequence, i - 1); + sz_size_t const previous_length = sequence->get_length(sequence, i - 1); + sz_cptr_t const current_str = sequence->get_start(sequence, i); + sz_size_t const current_length = sequence->get_length(sequence, i); sz_ordering_t const ordering = sz_order( // previous_str, previous_length > window_capacity ? window_capacity : previous_length, // current_str, current_length > window_capacity ? window_capacity : current_length); @@ -131,20 +220,20 @@ SZ_PUBLIC void _sz_qsort_serial_export_prefixes( * * @see https://en.wikipedia.org/wiki/Dutch_national_flag_problem */ -SZ_PUBLIC void _sz_qsort_serial_3way_partition( // - _sz_sort_ngram_t *const global_windows, sz_sorted_idx_t *const global_order, // - sz_size_t const start_in_collection, sz_size_t const end_in_collection, // +SZ_PUBLIC void _sz_sequence_argsort_serial_3way_partition( // + sz_pgram_t *const global_pgrams, sz_sorted_idx_t *const global_order, // + sz_size_t const start_in_sequence, sz_size_t const end_in_sequence, // sz_size_t *first_pivot_offset, sz_size_t *last_pivot_offset) { // Chose the pivot offset with Sedgewick's method. 
- _sz_sort_ngram_t pivot_window; + sz_pgram_t pivot_window; { - sz_size_t const middle_offset = start_in_collection + (end_in_collection - start_in_collection) / 2; - sz_size_t const last_offset = end_in_collection - 1; - sz_size_t const first_offset = start_in_collection; - _sz_sort_ngram_t const first_window = global_windows[first_offset]; - _sz_sort_ngram_t const middle_window = global_windows[middle_offset]; - _sz_sort_ngram_t const last_window = global_windows[last_offset]; + sz_size_t const middle_offset = start_in_sequence + (end_in_sequence - start_in_sequence) / 2; + sz_size_t const last_offset = end_in_sequence - 1; + sz_size_t const first_offset = start_in_sequence; + sz_pgram_t const first_window = global_pgrams[first_offset]; + sz_pgram_t const middle_window = global_pgrams[middle_offset]; + sz_pgram_t const last_window = global_pgrams[last_offset]; if (first_window < middle_window) { if (middle_window < last_window) { pivot_window = middle_window; } else if (first_window < last_window) { pivot_window = last_window; } @@ -158,120 +247,116 @@ SZ_PUBLIC void _sz_qsort_serial_3way_partition( } // Loop through the collection and move the elements around the pivot with the 3-way partitioning. - sz_size_t partitioning_progress = start_in_collection; // Current index. - sz_size_t less_than_pivot_offset = start_in_collection; // Boundary for elements < pivot_window. - sz_size_t greater_than_pivot_offset = end_in_collection - 1; // Boundary for elements > pivot_window. + sz_size_t partitioning_progress = start_in_sequence; // Current index. + sz_size_t smaller_offset = start_in_sequence; // Boundary for elements < pivot_window. + sz_size_t greater_offset = end_in_sequence - 1; // Boundary for elements > pivot_window. - while (partitioning_progress <= greater_than_pivot_offset) { + while (partitioning_progress <= greater_offset) { // Element is less than pivot: swap into the < pivot region. - if (global_windows[partitioning_progress] < pivot_window) { -#if defined(_SZ_IS_64_BIT) - sz_u64_swap(&global_order[partitioning_progress], &global_order[less_than_pivot_offset]); - sz_u64_swap(&global_windows[partitioning_progress], &global_windows[less_than_pivot_offset]); -#else - sz_u32_swap(&global_order[partitioning_progress], &global_order[less_than_pivot_offset]); - sz_u32_swap(&global_windows[partitioning_progress], &global_windows[less_than_pivot_offset]); -#endif + if (global_pgrams[partitioning_progress] < pivot_window) { + _sz_swap(sz_sorted_idx_t, global_order[partitioning_progress], global_order[smaller_offset]); + _sz_swap(sz_pgram_t, global_pgrams[partitioning_progress], global_pgrams[smaller_offset]); ++partitioning_progress; - ++less_than_pivot_offset; + ++smaller_offset; } // Element is greater than pivot: swap into the > pivot region. 
- else if (global_windows[partitioning_progress] > pivot_window) { -#if defined(_SZ_IS_64_BIT) - sz_u64_swap(&global_order[partitioning_progress], &global_order[greater_than_pivot_offset]); - sz_u64_swap(&global_windows[partitioning_progress], &global_windows[greater_than_pivot_offset]); -#else - sz_u32_swap(&global_order[partitioning_progress], &global_order[greater_than_pivot_offset]); - sz_u32_swap(&global_windows[partitioning_progress], &global_windows[greater_than_pivot_offset]); -#endif - --greater_than_pivot_offset; + else if (global_pgrams[partitioning_progress] > pivot_window) { + _sz_swap(sz_sorted_idx_t, global_order[partitioning_progress], global_order[greater_offset]); + _sz_swap(sz_pgram_t, global_pgrams[partitioning_progress], global_pgrams[greater_offset]); + --greater_offset; } // Element equals pivot_window: leave it in place. else { ++partitioning_progress; } } - *first_pivot_offset = less_than_pivot_offset; - *last_pivot_offset = greater_than_pivot_offset; + *first_pivot_offset = smaller_offset; + *last_pivot_offset = greater_offset; } -SZ_PUBLIC void _sz_qsort_serial_recursively( // - sz_sequence_t const *const collection, // - _sz_sort_ngram_t *const global_windows, sz_size_t *const global_order, // - sz_size_t const start_in_collection, sz_size_t const end_in_collection, // - sz_size_t const start_character) { +/** + * @brief Recursive Quick-Sort implementation backing both the `sz_sequence_argsort` and `sz_pgrams_sort`, + * and using the `_sz_sequence_argsort_serial_3way_partition` under the hood. + */ +SZ_PUBLIC void _sz_sequence_argsort_serial_recursively( // + sz_pgram_t *const global_pgrams, sz_sorted_idx_t *const global_order, // + sz_size_t const start_in_sequence, sz_size_t const end_in_sequence) { // Partition the collection around some pivot or 2 pivots in a 3-way partitioning sz_size_t first_pivot_index, last_pivot_index; - _sz_qsort_serial_3way_partition( // - global_windows, global_order, // - start_in_collection, end_in_collection, // + _sz_sequence_argsort_serial_3way_partition( // + global_pgrams, global_order, // + start_in_sequence, end_in_sequence, // &first_pivot_index, &last_pivot_index); // Recursively sort the left partition - if (start_in_collection < first_pivot_index) - _sz_qsort_serial_recursively(collection, global_windows, global_order, start_in_collection, first_pivot_index, - start_character); + if (start_in_sequence < first_pivot_index) + _sz_sequence_argsort_serial_recursively(global_pgrams, global_order, start_in_sequence, first_pivot_index); // Recursively sort the right partition - if (last_pivot_index + 1 < end_in_collection) - _sz_qsort_serial_recursively(collection, global_windows, global_order, last_pivot_index + 1, end_in_collection, - start_character); + if (last_pivot_index + 1 < end_in_sequence) + _sz_sequence_argsort_serial_recursively(global_pgrams, global_order, last_pivot_index + 1, end_in_sequence); } -SZ_PUBLIC void _sz_qsort_serial_next_window( // - sz_sequence_t const *const collection, // - _sz_sort_ngram_t *const global_windows, sz_size_t *const global_order, // - sz_size_t const start_in_collection, sz_size_t const end_in_collection, // +/** + * @brief Recursive Quick-Sort adaptation for strings, that processes the strings a few N-grams at a time. + * It combines `_sz_sequence_argsort_serial_export_next_pgrams` and `_sz_sequence_argsort_serial_recursively`, + * recursively diving into the identical windows. 
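As a side note on the partitioning step above: the 3-way split is the classic Dutch-national-flag scheme. Below is a compact standalone sketch of it over plain 64-bit keys, with illustrative names and signed indices to keep the boundary arithmetic simple; the permutation array that the library co-moves alongside the keys is omitted here.

```c
#include <stddef.h>
#include <stdint.h>

// Illustrative 3-way partition: on return, [0, *first_equal) holds keys < pivot,
// [*first_equal, *last_equal] holds keys == pivot, (*last_equal, count) holds keys > pivot.
static void three_way_partition(uint64_t *keys, ptrdiff_t count, uint64_t pivot,
                                ptrdiff_t *first_equal, ptrdiff_t *last_equal) {
    ptrdiff_t progress = 0, smaller = 0, greater = count - 1;
    while (progress <= greater) {
        if (keys[progress] < pivot) {
            uint64_t tmp = keys[progress];
            keys[progress] = keys[smaller];
            keys[smaller] = tmp;
            ++progress, ++smaller;
        }
        else if (keys[progress] > pivot) {
            uint64_t tmp = keys[progress];
            keys[progress] = keys[greater];
            keys[greater] = tmp;
            --greater; // the swapped-in key has not been examined yet
        }
        else { ++progress; } // equal to the pivot: leave in place
    }
    *first_equal = smaller;
    *last_equal = greater;
}
```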
+ */ +SZ_PUBLIC void _sz_sequence_argsort_serial_next_pgrams( // + sz_sequence_t const *const sequence, // + sz_pgram_t *const global_pgrams, sz_sorted_idx_t *const global_order, // + sz_size_t const start_in_sequence, sz_size_t const end_in_sequence, // sz_size_t const start_character) { // Prepare the new range of windows - _sz_qsort_serial_export_prefixes(collection, global_windows, global_order, start_in_collection, end_in_collection, - start_character); + _sz_sequence_argsort_serial_export_next_pgrams(sequence, global_pgrams, global_order, start_in_sequence, + end_in_sequence, start_character); // Sort current windows with a quicksort - _sz_qsort_serial_recursively(collection, global_windows, global_order, start_in_collection, end_in_collection, - start_character); + _sz_sequence_argsort_serial_recursively(global_pgrams, global_order, start_in_sequence, end_in_sequence); // Depending on the architecture, we will export a different number of bytes. // On 32-bit architectures, we will export 3 bytes, and on 64-bit architectures - 7 bytes. - sz_size_t const window_capacity = sizeof(_sz_sort_ngram_t) - 1; + sz_size_t const window_capacity = sizeof(sz_pgram_t) - 1; // Repeat the procedure for the identical windows - sz_size_t nested_start = start_in_collection; - sz_size_t nested_end = start_in_collection; - while (nested_end != end_in_collection) { + sz_size_t nested_start = start_in_sequence; + sz_size_t nested_end = start_in_sequence; + while (nested_end != end_in_sequence) { // Find the end of the identical windows - _sz_sort_ngram_t current_window_integer = global_windows[nested_start]; - while (nested_end != end_in_collection && current_window_integer == global_windows[nested_end]) ++nested_end; + sz_pgram_t current_window_integer = global_pgrams[nested_start]; + while (nested_end != end_in_sequence && current_window_integer == global_pgrams[nested_end]) ++nested_end; // If the identical windows are not trivial and each string has more characters, sort them recursively sz_cptr_t current_window_str = (sz_cptr_t)¤t_window_integer; sz_size_t current_window_length = (sz_size_t)current_window_str[0]; //! The byte order was swapped - if (nested_end - nested_start > 1 && current_window_length == window_capacity) { - _sz_qsort_serial_next_window(collection, global_windows, global_order, nested_start, nested_end, - start_character + window_capacity); + int has_multiple_strings = nested_end - nested_start > 1; + int has_more_characters_in_each = current_window_length == window_capacity; + if (has_multiple_strings && has_more_characters_in_each) { + _sz_sequence_argsort_serial_next_pgrams(sequence, global_pgrams, global_order, nested_start, nested_end, + start_character + window_capacity); } // Move to the next nested_start = nested_end; } } -SZ_PUBLIC void _sz_qsort_serial_insertion(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { +SZ_PUBLIC void _sz_sequence_argsort_serial_insertion(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { // This algorithm needs no memory allocations: sz_unused(alloc); // Assume `order` is already initialized with 0, 1, 2, ... N. - for (sz_size_t i = 1; i < collection->count; ++i) { + for (sz_size_t i = 1; i < sequence->count; ++i) { sz_sorted_idx_t current_idx = order[i]; sz_size_t j = i; while (j > 0) { // Get the two strings to compare. 
sz_sorted_idx_t previous_idx = order[j - 1]; - sz_cptr_t previous_start = collection->get_start(collection, previous_idx); - sz_cptr_t current_start = collection->get_start(collection, current_idx); - sz_size_t previous_length = collection->get_length(collection, previous_idx); - sz_size_t current_length = collection->get_length(collection, current_idx); + sz_cptr_t previous_start = sequence->get_start(sequence, previous_idx); + sz_cptr_t current_start = sequence->get_start(sequence, current_idx); + sz_size_t previous_length = sequence->get_length(sequence, previous_idx); + sz_size_t current_length = sequence->get_length(sequence, current_idx); // Use the provided sz_order to compare. sz_ordering_t ordering = sz_order(previous_start, previous_length, current_start, current_length); @@ -287,16 +372,16 @@ SZ_PUBLIC void _sz_qsort_serial_insertion(sz_sequence_t const *collection, sz_me } } -SZ_PUBLIC sz_bool_t sz_qsort_serial(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { +SZ_PUBLIC sz_bool_t sz_sequence_argsort_serial(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { // First, initialize the `order` with `std::iota`-like behavior. - for (sz_size_t i = 0; i != collection->count; ++i) order[i] = i; + for (sz_size_t i = 0; i != sequence->count; ++i) order[i] = i; // On very small collections - just use the quadratic-complexity insertion sort // without any smart optimizations or memory allocations. - if (collection->count <= 32) { - _sz_qsort_serial_insertion(collection, alloc, order); + if (sequence->count <= 32) { + _sz_sequence_argsort_serial_insertion(sequence, alloc, order); return sz_true_k; } @@ -311,51 +396,255 @@ SZ_PUBLIC sz_bool_t sz_qsort_serial(sz_sequence_t const *collection, sz_memory_a // Assuming that some strings may contain or even end with NULL bytes, we need to make sure, that their length // is included in those P-long words. So, in reality, we will be taking (P-1) bytes from each string on every // iteration of a recursive algorithm. - _sz_sort_ngram_t *windows = - (_sz_sort_ngram_t *)alloc->allocate(collection->count * sizeof(_sz_sort_ngram_t), alloc); + sz_size_t memory_usage = sequence->count * sizeof(sz_pgram_t); + sz_pgram_t *windows = (sz_pgram_t *)alloc->allocate(memory_usage, alloc); if (!windows) return sz_false_k; - // Recursively sort the whole collection. - _sz_qsort_serial_next_window(collection, windows, order, 0, collection->count, 0); + // Recursively sort the whole sequence. + _sz_sequence_argsort_serial_next_pgrams(sequence, windows, order, 0, sequence->count, 0); // Free temporary storage. - alloc->free(windows, collection->count * sizeof(_sz_sort_ngram_t), alloc); + alloc->free(windows, memory_usage, alloc); + return sz_true_k; +} + +SZ_PUBLIC sz_bool_t sz_pgrams_sort_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { + sz_unused(alloc); + // First, initialize the `order` with `std::iota`-like behavior. + for (sz_size_t i = 0; i != count; ++i) order[i] = i; + // Reuse the string sorting algorithm for sorting the "pgrams". + _sz_sequence_argsort_serial_recursively((sz_pgram_t *)pgrams, order, 0, count); + return sz_true_k; +} + +#pragma endregion // Serial QuickSort Implementation + +#pragma region Serial MergeSort Implementation + +/** + * @brief A scalar sorting network for 8 elements that reorders both the keys + * and their corresponding offsets in only 19 comparisons, the most efficient + * variant currently known. 
+ * @see https://en.wikipedia.org/wiki/Sorting_network + * + * The network consists of 6 stages with the following compare–swap pairs: + * + * Stage 1: (0,1), (2,3), (4,5), (6,7) + * Stage 2: (0,2), (1,3), (4,6), (5,7) + * Stage 3: (1,2), (5,6) + * Stage 4: (0,4), (1,5), (2,6), (3,7) + * Stage 5: (2,4), (3,5) + * Stage 6: (1,2), (3,4), (5,6) + */ +void _sz_sequence_argsort_stable_serial_8x_network(sz_pgram_t *keys, sz_sorted_idx_t *offsets) { + +#define _sz_sequence_argsort_stable_8x_conditional_swap(i, j) \ + do { \ + if (keys[i] > keys[j]) { \ + _sz_swap(sz_pgram_t, keys[i], keys[j]); \ + _sz_swap(sz_sorted_idx_t, offsets[i], offsets[j]); \ + } \ + } while (0) + + // Stage 1: Compare–swap adjacent pairs. + _sz_sequence_argsort_stable_8x_conditional_swap(0, 1); + _sz_sequence_argsort_stable_8x_conditional_swap(2, 3); + _sz_sequence_argsort_stable_8x_conditional_swap(4, 5); + _sz_sequence_argsort_stable_8x_conditional_swap(6, 7); + + // Stage 2: Compare–swap with stride 2. + _sz_sequence_argsort_stable_8x_conditional_swap(0, 2); + _sz_sequence_argsort_stable_8x_conditional_swap(1, 3); + _sz_sequence_argsort_stable_8x_conditional_swap(4, 6); + _sz_sequence_argsort_stable_8x_conditional_swap(5, 7); + + // Stage 3: Compare–swap between middle elements. + _sz_sequence_argsort_stable_8x_conditional_swap(1, 2); + _sz_sequence_argsort_stable_8x_conditional_swap(5, 6); + + // Stage 4: Compare–swap across the two halves. + _sz_sequence_argsort_stable_8x_conditional_swap(0, 4); + _sz_sequence_argsort_stable_8x_conditional_swap(1, 5); + _sz_sequence_argsort_stable_8x_conditional_swap(2, 6); + _sz_sequence_argsort_stable_8x_conditional_swap(3, 7); + + // Stage 5: Compare–swap within each half. + _sz_sequence_argsort_stable_8x_conditional_swap(2, 4); + _sz_sequence_argsort_stable_8x_conditional_swap(3, 5); + + // Stage 6: Final compare–swap of adjacent elements. + _sz_sequence_argsort_stable_8x_conditional_swap(1, 2); + _sz_sequence_argsort_stable_8x_conditional_swap(3, 4); + _sz_sequence_argsort_stable_8x_conditional_swap(5, 6); + +#undef _sz_sequence_argsort_stable_8x_conditional_swap + + // Validate the sorting network. + if (SZ_DEBUG) + for (sz_size_t i = 1; i < 8; ++i) + _sz_assert(keys[i - 1] <= keys[i] && "The sorting network must sort the keys in ascending order."); +} + +/** + * @brief Helper function similar to `std::set_union` over pairs of integers and their original indices. 
+ * @see https://en.cppreference.com/w/cpp/algorithm/set_union + */ +void _sz_sequence_argsort_stable_serial_merge( // + sz_pgram_t const *first_pgrams, sz_sorted_idx_t const *first_indices, sz_size_t first_count, // + sz_pgram_t const *second_pgrams, sz_sorted_idx_t const *second_indices, sz_size_t second_count, // + sz_pgram_t *result_pgrams, sz_sorted_idx_t *result_indices) { + + // Compute the end pointers for each input array + sz_pgram_t const *const first_end = first_pgrams + first_count; + sz_pgram_t const *const second_end = second_pgrams + second_count; + + // Merge until one array is exhausted + while (first_pgrams < first_end && second_pgrams < second_end) { + if (*first_pgrams < *second_pgrams) { + *result_pgrams++ = *first_pgrams++; + *result_indices++ = *first_indices++; + } + else if (*second_pgrams < *first_pgrams) { + *result_pgrams++ = *second_pgrams++; + *result_indices++ = *second_indices++; + } + else { + // Equal keys: for stability, choose the one from the first array + *result_pgrams++ = *first_pgrams; + *result_indices++ = *first_indices; + ++first_pgrams; + ++first_indices; + ++second_pgrams; + ++second_indices; + } + } + + // Copy any remaining elements from the first array + while (first_pgrams < first_end) { + *result_pgrams++ = *first_pgrams++; + *result_indices++ = *first_indices++; + } + + // Copy any remaining elements from the second array + while (second_pgrams < second_end) { + *result_pgrams++ = *second_pgrams++; + *result_indices++ = *second_indices++; + } +} + +SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { + + // First, initialize the `order` with `std::iota`-like behavior. + for (sz_size_t i = 0; i != count; ++i) order[i] = i; + + // Go through short chunks of 8 elements and sort them with a sorting network. + for (sz_size_t i = 0; i + 8 <= count; i += 8) _sz_sequence_argsort_stable_serial_8x_network(pgrams + i, order + i); + + // For the tail of the array, sort it with insertion sort. + for (sz_size_t i = count & ~7; i < count; i++) { + sz_pgram_t current_address = pgrams[i]; + sz_sorted_idx_t current_idx = order[i]; + sz_size_t j = i; + while (j > 0 && pgrams[j - 1] > current_address) { + pgrams[j] = pgrams[j - 1]; + order[j] = order[j - 1]; + --j; + } + pgrams[j] = current_address; + order[j] = current_idx; + } + + // At this point, the array is partitioned into sorted runs. + // We'll now merge these runs until the whole array is sorted. + // Allocate temporary memory to hold merged results: + // - one block for keys (`sz_pgram_t`) + // - one block for indices (`sz_sorted_idx_t`) + sz_size_t memory_usage = sizeof(sz_pgram_t) * count + sizeof(sz_sorted_idx_t) * count; + sz_pgram_t *pgrams_temporary = (sz_pgram_t *)alloc->allocate(memory_usage, alloc); + sz_sorted_idx_t *order_temporary = (sz_sorted_idx_t *)(pgrams_temporary + count); + if (!pgrams_temporary) return sz_false_k; + + // Set initial run size (the sorted chunks). + sz_size_t run_size = 8; + + // Pointers for current source and destination arrays. + sz_pgram_t *src_pgrams = pgrams; + sz_sorted_idx_t *src_order = order; + sz_pgram_t *dst_pgrams = pgrams_temporary; + sz_sorted_idx_t *dst_order = order_temporary; + + // Merge sorted runs in a bottom-up manner until the run size covers the whole array. + while (run_size < count) { + // Process adjacent runs. + for (sz_size_t i = 0; i < count; i += run_size * 2) { + // Determine the number of elements in the left run. 
+ sz_size_t left_count = run_size; + if (i + left_count > count) { left_count = count - i; } + + // Determine the number of elements in the right run. + sz_size_t right_count = run_size; + if (i + run_size >= count) { right_count = 0; } + else if (i + run_size + right_count > count) { right_count = count - (i + run_size); } + + // Merge the two runs: + _sz_sequence_argsort_stable_serial_merge( // + src_pgrams + i, src_order + i, left_count, // + src_pgrams + i + run_size, src_order + i + run_size, right_count, // + dst_pgrams + i, dst_order + i); + } + + // Swap the roles of the source and destination arrays. + _sz_swap(sz_pgram_t *, src_pgrams, dst_pgrams); + _sz_swap(sz_sorted_idx_t *, src_order, dst_order); + + // Double the run size for the next pass. + run_size *= 2; + } + + // If the final sorted result is not in the original array, copy the sorted results back. + if (src_pgrams != pgrams) + for (sz_size_t i = 0; i < count; ++i) pgrams[i] = src_pgrams[i], order[i] = src_order[i]; + + // Free the temporary memory used for merging. + alloc->free(pgrams_temporary, memory_usage, alloc); return sz_true_k; } -#pragma endregion // Serial Implementation +#pragma endregion // Serial MergeSort Implementation #pragma region Ice Lake Implementation -SZ_PUBLIC void _sz_qsort_ice_recursively( // - sz_sequence_t const *const collection, // - _sz_sort_ngram_t *const global_windows, sz_size_t *const global_order, // - sz_size_t const start_in_collection, sz_size_t const end_in_collection, // +SZ_PUBLIC void _sz_sequence_argsort_ice_recursively( // + sz_sequence_t const *const collection, // + sz_pgram_t *const global_pgrams, sz_size_t *const global_order, // + sz_size_t const start_in_sequence, sz_size_t const end_in_sequence, // sz_size_t const start_character) { // Prepare the new range of windows - _sz_qsort_serial_export_prefixes(collection, global_windows, global_order, start_in_collection, end_in_collection, - start_character); + _sz_sequence_argsort_serial_export_next_pgrams(collection, global_pgrams, global_order, start_in_sequence, + end_in_sequence, start_character); // We can implement a form of a Radix sort here, that will count the number of elements with // a certain bit set. The naive approach may require too many loops over data. A more "vectorized" // approach would be to maintain a histogram for several bits at once. For 4 bits we will // need 2^4 = 16 counters. sz_size_t histogram[16] = {0}; - for (sz_size_t byte_in_window = 0; byte_in_window != sizeof(_sz_sort_ngram_t); ++byte_in_window) { + for (sz_size_t byte_in_window = 0; byte_in_window != sizeof(sz_pgram_t); ++byte_in_window) { // First sort based on the low nibble of each byte. 
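The Ice Lake draft above builds per-digit histograms for a radix-style pass. As a reference point, here is a minimal serial counting-sort pass over a single 8-bit digit, which needs 256 counters (a 4-bit nibble would need 16, as in the histogram sketched above); the names and the scratch-buffer contract are illustrative. Repeating the pass for every byte, least significant first, yields a fully sorted array, and an arg-sorting variant would carry an index array through the same permutation, much like the `global_order` scatter in the function above.

```c
#include <stddef.h>
#include <stdint.h>

// One stable counting-sort pass: reorder `count` keys by the byte at `byte_index`,
// writing the result into `scratch`.
static void counting_sort_pass(uint64_t const *keys, uint64_t *scratch, size_t count, size_t byte_index) {
    size_t histogram[256] = {0};
    for (size_t i = 0; i < count; ++i) ++histogram[(keys[i] >> (byte_index * 8)) & 0xFF];
    size_t offset = 0;
    for (size_t digit = 0; digit < 256; ++digit) {
        size_t const bucket_size = histogram[digit];
        histogram[digit] = offset; // exclusive prefix sum: start of each bucket
        offset += bucket_size;
    }
    for (size_t i = 0; i < count; ++i) {
        size_t const digit = (keys[i] >> (byte_index * 8)) & 0xFF;
        scratch[histogram[digit]++] = keys[i];
    }
}
```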
- for (sz_size_t i = start_in_collection; i < end_in_collection; ++i) { - sz_size_t const byte = (global_windows[i] >> (byte_in_window * 8)) & 0xFF; + for (sz_size_t i = start_in_sequence; i < end_in_sequence; ++i) { + sz_size_t const byte = (global_pgrams[i] >> (byte_in_window * 8)) & 0xFF; ++histogram[byte]; } - sz_size_t offset = start_in_collection; + sz_size_t offset = start_in_sequence; for (sz_size_t i = 0; i != 16; ++i) { sz_size_t const count = histogram[i]; histogram[i] = offset; offset += count; } - for (sz_size_t i = start_in_collection; i < end_in_collection; ++i) { - sz_size_t const byte = (global_windows[i] >> (byte_in_window * 8)) & 0xFF; + for (sz_size_t i = start_in_sequence; i < end_in_sequence; ++i) { + sz_size_t const byte = (global_pgrams[i] >> (byte_in_window * 8)) & 0xFF; global_order[histogram[byte]] = i; ++histogram[byte]; } @@ -370,8 +659,9 @@ SZ_PUBLIC void _sz_qsort_ice_recursively( // #pragma region Compile Time Dispatching #if !SZ_DYNAMIC_DISPATCH -SZ_DYNAMIC sz_bool_t sz_qsort(sz_sequence_t const *collection, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order) { - return sz_qsort_serial(collection, alloc, order); +SZ_DYNAMIC sz_bool_t sz_sequence_argsort(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { + return sz_sequence_argsort_serial(sequence, alloc, order); } #endif // !SZ_DYNAMIC_DISPATCH diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index ba405700..0b23b33b 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -47,7 +47,7 @@ #include "memory.h" // `sz_copy`, `sz_move`, `sz_fill` #include "similarity.h" // `sz_edit_distance`, `sz_alignment_score` #include "small_string.h" // `sz_string_t`, `sz_string_init`, `sz_string_free` -#include "sort.h" // `sz_sort`, `sz_sort_partial`, `sz_partition` +#include "sort.h" // `sz_sequence_argsort`, `sz_pgrams_sort`, `sz_pgrams_sort_stable` #include "types.h" // `sz_size_t`, `sz_bool_t`, `sz_ordering_t` #ifdef __cplusplus diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index 664ce607..0a4737ad 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -3974,8 +3974,8 @@ void randomize(basic_string_slice string, string_view alphabet = "ab using sorted_idx_t = sz_sorted_idx_t; /** - * @brief Internal data-structure used to forward the arguments to the `sz_sort` function. - * @see sorted_order + * @brief Internal data-structure used to forward the arguments to the `sz_sequence_argsort` function. + * @see argsort */ template struct _sequence_args { @@ -4004,18 +4004,18 @@ sz_size_t _call_sequence_member_length(struct sz_sequence_t const *sequence, sz_ /** * @brief Computes the permutation of an array, that would lead to sorted order. * The elements of the array must be convertible to a `string_view` with the given extractor. - * Unlike the `sz_sort` C interface, overwrites the output array. + * Unlike the `sz_sequence_argsort` C interface, overwrites the output array. * * @param[in] begin The pointer to the first element of the array. * @param[in] end The pointer to the element after the last element of the array. * @param[out] order The pointer to the output array of indices, that will be populated with the permutation. * @param[in] extractor The function object that extracts the string from the object. 
* - * @see sz_sort + * @see sz_sequence_argsort */ template -void sorted_order(objects_type_ const *begin, objects_type_ const *end, sorted_idx_t *order, - string_extractor_ &&extractor) noexcept { +void argsort(objects_type_ const *begin, objects_type_ const *end, sorted_idx_t *order, + string_extractor_ &&extractor) noexcept { // Pack the arguments into a single structure to reference it from the callback. _sequence_args args = {begin, static_cast(end - begin), order, @@ -4030,7 +4030,8 @@ void sorted_order(objects_type_ const *begin, objects_type_ const *end, sorted_i array.get_length = _call_sequence_member_length; using sz_alloc_type = sz_memory_allocator_t; - _with_alloc>([&](sz_alloc_type &alloc) { return sz_sort(&array, &alloc, order); }); + _with_alloc>( + [&](sz_alloc_type &alloc) { return sz_sequence_argsort(&array, &alloc, order); }); } #if !SZ_AVOID_STL @@ -4075,10 +4076,10 @@ std::bitset hashes_fingerprint(basic_string const &str * @throw `std::bad_alloc` if the allocation fails. */ template -std::vector sorted_order( // +std::vector argsort( // objects_type_ const *begin, objects_type_ const *end, string_extractor_ &&extractor) noexcept(false) { std::vector order(end - begin); - sorted_order(begin, end, order.data(), std::forward(extractor)); + argsort(begin, end, order.data(), std::forward(extractor)); return order; } @@ -4088,10 +4089,10 @@ std::vector sorted_order( // * @throw `std::bad_alloc` if the allocation fails. */ template -std::vector sorted_order(string_like_type_ const *begin, string_like_type_ const *end) noexcept(false) { +std::vector argsort(string_like_type_ const *begin, string_like_type_ const *end) noexcept(false) { static_assert( // std::is_convertible::value, "The type must be convertible to string_view."); - return sorted_order(begin, end, [](string_like_type_ const &s) -> string_view { return s; }); + return argsort(begin, end, [](string_like_type_ const &s) -> string_view { return s; }); } /** @@ -4100,11 +4101,11 @@ std::vector sorted_order(string_like_type_ const *begin, string_li * @throw `std::bad_alloc` if the allocation fails. 
*/ template -std::vector sorted_order(std::vector const &array) noexcept(false) { +std::vector argsort(std::vector const &array) noexcept(false) { static_assert( // std::is_convertible::value, "The type must be convertible to string_view."); - return sorted_order(array.data(), array.data() + array.size(), - [](string_like_type_ const &s) -> string_view { return s; }); + return argsort(array.data(), array.data() + array.size(), + [](string_like_type_ const &s) -> string_view { return s; }); } #endif diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index 6d8086f2..01d090b2 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -325,6 +325,7 @@ typedef sz_i8_t sz_error_cost_t; // Character mismatch cost for fuzzy matching f struct sz_sequence_t; // Forward declaration of an ordered collection of strings typedef sz_size_t sz_sorted_idx_t; // Index of a sorted string in a list of strings +typedef sz_size_t sz_pgram_t; // "Pointer-sized N-gram" of a string typedef enum { sz_false_k = 0, sz_true_k = 1 } sz_bool_t; // Only one relevant bit typedef enum { sz_less_k = -1, sz_equal_k = 0, sz_greater_k = 1 } sz_ordering_t; // Only three possible states: <=> @@ -481,8 +482,8 @@ typedef sz_size_t (*sz_edit_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size typedef sz_ssize_t (*sz_alignment_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *, sz_error_cost_t, sz_memory_allocator_t *); -/** @brief Signature of ::sz_sort. */ -typedef sz_bool_t (*sz_sort_t)(struct sz_sequence_t const *, sz_memory_allocator_t *, sz_sorted_idx_t *); +/** @brief Signature of ::sz_sequence_argsort. */ +typedef sz_bool_t (*sz_sequence_argsort_t)(struct sz_sequence_t const *, sz_memory_allocator_t *, sz_sorted_idx_t *); #pragma endregion @@ -644,8 +645,6 @@ SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_le typedef sz_cptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t); typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_bool_t (*sz_string_is_less_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); typedef struct sz_sequence_t { void const *handle; @@ -984,24 +983,6 @@ SZ_INTERNAL sz_u64_t sz_u64_transpose(sz_u64_t x) { return x; } -/** - * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. - */ -SZ_INTERNAL void sz_u64_swap(sz_u64_t *a, sz_u64_t *b) { - sz_u64_t t = *a; - *a = *b; - *b = t; -} - -/** - * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. - */ -SZ_INTERNAL void sz_pointer_swap(void **a, void **b) { - void *t = *a; - *a = *b; - *b = t; -} - /** * @brief Load a 16-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. 
*/ diff --git a/python/lib.c b/python/lib.c index c5346772..1406691c 100644 --- a/python/lib.c +++ b/python/lib.c @@ -3214,7 +3214,7 @@ static sz_bool_t Strs_sort_(Strs *self, sz_string_view_t **parts_output, sz_sort sequence.get_start = parts_get_start; sequence.get_length = parts_get_length; for (sz_sorted_idx_t i = 0; i != sequence.count; ++i) sequence.order[i] = i; - sz_sort(&sequence); + sz_sequence_argsort(&sequence); // Export results *parts_output = parts; diff --git a/scripts/bench.hpp b/scripts/bench.hpp index b321fa7e..cbec9bf5 100644 --- a/scripts/bench.hpp +++ b/scripts/bench.hpp @@ -44,24 +44,24 @@ using binary_function_t = std::function +template struct tracked_function_gt { std::string name {""}; - function_type function {nullptr}; + function_type_ function {nullptr}; bool needs_testing {false}; std::size_t failed_count; std::vector failed_strings; benchmark_result_t results; - tracked_function_gt(std::string name = "", function_type function = nullptr, bool needs_testing = false) + tracked_function_gt(std::string name = "", function_type_ function = nullptr, bool needs_testing = false) : name(name), function(function), needs_testing(needs_testing), failed_count(0), failed_strings(), results() {} tracked_function_gt(tracked_function_gt const &) = default; tracked_function_gt &operator=(tracked_function_gt const &) = default; void print() const { - bool is_binary = std::is_same(); + bool is_binary = std::is_same(); // If failures have occurred, output them to file to simplify the debugging process. bool contains_failures = !failed_strings.empty(); @@ -229,35 +229,46 @@ inline sz_string_view_t to_c(sz::string_view str) noexcept { return {str.data(), inline sz_string_view_t to_c(sz::string const &str) noexcept { return {str.data(), str.size()}; } inline sz_string_view_t to_c(sz_string_view_t str) noexcept { return str; } +/** + * @brief Invoke the same function many times, until the total time elapsed exceeds the limit. + * @return Total seconds elapsed. + */ +template +seconds_t repeat_until_limit(function_type_ &&function) { + + namespace stdc = std::chrono; + using clock_t = stdc::high_resolution_clock; + clock_t::time_point start_time = clock_t::now(); + seconds_t seconds = 0; + + while (seconds < seconds_per_benchmark) { + function(); + clock_t::time_point current_time = clock_t::now(); + seconds = stdc::duration_cast(current_time - start_time).count() / 1.e9; + } + return seconds; +} + /** * @brief Loop over all elements in a dataset in somewhat random order, benchmarking the function cost. * @param strings Strings to loop over. Length must be a power of two. * @param function Function to be applied to each `sz_string_view_t`. Must return the number of bytes processed. * @return Number of seconds per iteration. 
*/ -template -benchmark_result_t bench_on_tokens(strings_type &&strings, function_type &&function) { +template +benchmark_result_t bench_on_tokens(strings_type_ &&strings, function_type_ &&function) { - namespace stdc = std::chrono; - using clock_t = stdc::high_resolution_clock; - clock_t::time_point t1 = clock_t::now(); benchmark_result_t result; - std::size_t lookup_mask = bit_floor(strings.size()) - 1; - - while (true) { + std::size_t const lookup_mask = bit_floor(strings.size()) - 1; + result.seconds = repeat_until_limit([&]() { // Unroll a few iterations, to avoid some for-loops overhead and minimize impact of time-tracking - { - result.bytes_passed += function(strings[(result.iterations + 0) & lookup_mask]) + - function(strings[(result.iterations + 1) & lookup_mask]) + - function(strings[(result.iterations + 2) & lookup_mask]) + - function(strings[(result.iterations + 3) & lookup_mask]); - result.iterations += 4; - } - - clock_t::time_point t2 = clock_t::now(); - result.seconds = stdc::duration_cast(t2 - t1).count() / 1.e9; - if (result.seconds > seconds_per_benchmark) break; - } + result.bytes_passed += // + function(strings[(result.iterations + 0) & lookup_mask]) + + function(strings[(result.iterations + 1) & lookup_mask]) + + function(strings[(result.iterations + 2) & lookup_mask]) + + function(strings[(result.iterations + 3) & lookup_mask]); + result.iterations += 4; + }); return result; } @@ -269,31 +280,22 @@ benchmark_result_t bench_on_tokens(strings_type &&strings, function_type &&funct * Must return the number of bytes processed. * @return Number of seconds per iteration. */ -template -benchmark_result_t bench_on_token_pairs(strings_type &&strings, function_type &&function) { +template +benchmark_result_t bench_on_token_pairs(strings_type_ &&strings, function_type_ &&function) { - namespace stdc = std::chrono; - using clock_t = stdc::high_resolution_clock; - clock_t::time_point t1 = clock_t::now(); benchmark_result_t result; std::size_t lookup_mask = bit_floor(strings.size()) - 1; std::size_t largest_prime = static_cast(18446744073709551557ull); - - while (true) { + result.seconds = repeat_until_limit([&]() { // Unroll a few iterations, to avoid some for-loops overhead and minimize impact of time-tracking - { - auto second = (result.iterations * largest_prime) & lookup_mask; - result.bytes_passed += function(strings[(result.iterations + 0) & lookup_mask], strings[second]) + - function(strings[(result.iterations + 1) & lookup_mask], strings[second]) + - function(strings[(result.iterations + 2) & lookup_mask], strings[second]) + - function(strings[(result.iterations + 3) & lookup_mask], strings[second]); - result.iterations += 4; - } - - clock_t::time_point t2 = clock_t::now(); - result.seconds = stdc::duration_cast(t2 - t1).count() / 1.e9; - if (result.seconds > seconds_per_benchmark) break; - } + auto second_index = (result.iterations * largest_prime) & lookup_mask; + result.bytes_passed += // + function(strings[(result.iterations + 0) & lookup_mask], strings[second_index]) + + function(strings[(result.iterations + 1) & lookup_mask], strings[second_index]) + + function(strings[(result.iterations + 2) & lookup_mask], strings[second_index]) + + function(strings[(result.iterations + 3) & lookup_mask], strings[second_index]); + result.iterations += 4; + }); return result; } @@ -301,8 +303,8 @@ benchmark_result_t bench_on_token_pairs(strings_type &&strings, function_type && /** * @brief Evaluation for unary string operations: hashing. 
*/ -template -void bench_unary_functions(strings_type &&strings, functions_type &&variants) { +template +void bench_unary_functions(strings_type_ &&strings, functions_type &&variants) { for (std::size_t variant_idx = 0; variant_idx != variants.size(); ++variant_idx) { auto &variant = variants[variant_idx]; @@ -337,8 +339,8 @@ void bench_unary_functions(strings_type &&strings, functions_type &&variants) { /** * @brief Evaluation for binary string operations: equality, ordering, prefix, suffix, distance. */ -template -void bench_binary_functions(strings_type &&strings, functions_type &&variants) { +template +void bench_binary_functions(strings_type_ &&strings, functions_type &&variants) { for (std::size_t variant_idx = 0; variant_idx != variants.size(); ++variant_idx) { auto &variant = variants[variant_idx]; diff --git a/scripts/bench_sort.cpp b/scripts/bench_sort.cpp index 75800582..729ac856 100644 --- a/scripts/bench_sort.cpp +++ b/scripts/bench_sort.cpp @@ -19,8 +19,7 @@ using namespace ashvardanian::stringzilla::scripts; namespace sz = ashvardanian::stringzilla; using strings_t = std::vector; -using idx_t = sz_size_t; -using permute_t = std::vector; +using permute_t = std::vector; #pragma region C callbacks @@ -54,87 +53,128 @@ static int _get_qsort_order(const void *a, const void *b, void *arg) { #pragma endregion -void expect_sorted(strings_t const &strings, permute_t const &permute) { +template +void expect_sorted(strings_type_ const &strings, permute_t const &permute) { if (!std::is_sorted(permute.begin(), permute.end(), [&](std::size_t i, std::size_t j) { return strings[i] < strings[j]; })) throw std::runtime_error("Sorting failed!"); } -template -void bench_permute(char const *name, strings_t &strings, permute_t &permute, algo_at &&algo) { - namespace stdc = std::chrono; - using clock_t = stdc::high_resolution_clock; - constexpr std::size_t iterations = 3; - clock_t::time_point t1 = clock_t::now(); +template +void bench_permute(char const *name, callback_type_ &&callback) { // Run multiple iterations - for (std::size_t i = 0; i != iterations; ++i) { - std::iota(permute.begin(), permute.end(), 0); - algo(strings, permute); - } + std::size_t iterations = 0; + seconds_t duration = repeat_until_limit([&]() { + callback(); + iterations++; + }); // Measure elapsed time - clock_t::time_point t2 = clock_t::now(); - double dif = stdc::duration_cast(t2 - t1).count() * 1.0; - double millisecs = dif / (iterations * 1e6); - std::printf("Elapsed time is %.2lf milliseconds/iteration for %s.\n", millisecs, name); + duration /= iterations; + if (duration >= 0.1) { std::printf("Elapsed time is %.2lf seconds for %s.\n", duration, name); } + else if (duration >= 0.001) { std::printf("Elapsed time is %.2lf milliseconds for %s.\n", duration * 1e3, name); } + else { std::printf("Elapsed time is %.2lf microseconds for %s.\n", duration * 1e6, name); } } int main(int argc, char const **argv) { std::printf("StringZilla. Starting sorting benchmarks.\n"); - dataset_t dataset = prepare_benchmark_environment(argc, argv); - strings_t strings {dataset.tokens.begin(), dataset.tokens.end()}; + dataset_t const dataset = prepare_benchmark_environment(argc, argv); + strings_t const strings {dataset.tokens.begin(), dataset.tokens.end()}; + permute_t permute(strings.size()); + using allocator_t = std::allocator; + + // Before sorting the strings themselves, which is a heavy operation, let's sort some prefixes + // to understand how the sorting algorithm behaves. 
+ std::vector pgrams(strings.size()); + std::transform(strings.begin(), strings.end(), pgrams.begin(), [](std::string const &str) { + sz_pgram_t pgram = 0; + std::memcpy(&pgram, str.c_str(), (std::min)(sizeof(pgram), str.size())); + return pgram; + }); + + // Sorting P-grams + bench_permute("std::sort(pgrams)", [&]() { + std::iota(permute.begin(), permute.end(), 0); + std::sort(permute.begin(), permute.end(), + [&](sz_sorted_idx_t i, sz_sorted_idx_t j) { return pgrams[i] < pgrams[j]; }); + }); + expect_sorted(pgrams, permute); + + // Unlike the `std::sort` adaptation above, the `sz_pgrams_sort_serial` also sorts the input array inplace + std::vector pgrams_sorted(strings.size()); + bench_permute("sz_pgrams_sort_serial", [&]() { + std::copy(pgrams.begin(), pgrams.end(), pgrams_sorted.begin()); + std::iota(permute.begin(), permute.end(), 0); + sz::_with_alloc([&](sz_memory_allocator_t &alloc) { + return sz_pgrams_sort_serial(pgrams_sorted.data(), pgrams_sorted.size(), &alloc, permute.data()); + }); + }); + expect_sorted(pgrams, permute); - permute_t permute_base, permute_new; - permute_base.resize(strings.size()); - permute_new.resize(strings.size()); + // Unlike the `std::sort` adaptation above, the `sz_pgrams_sort_stable_serial` also sorts the input array inplace + bench_permute("sz_pgrams_sort_stable_serial", [&]() { + std::copy(pgrams.begin(), pgrams.end(), pgrams_sorted.begin()); + std::iota(permute.begin(), permute.end(), 0); + sz::_with_alloc([&](sz_memory_allocator_t &alloc) { + return sz_pgrams_sort_stable_serial(pgrams_sorted.data(), pgrams_sorted.size(), &alloc, permute.data()); + }); + }); + expect_sorted(pgrams, permute); - // Sorting - bench_permute("std::sort", strings, permute_base, [](strings_t const &strings, permute_t &permute) { - std::sort(permute.begin(), permute.end(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; }); + // Sorting strings + bench_permute("std::sort(positions)", [&]() { + std::iota(permute.begin(), permute.end(), 0); + std::sort(permute.begin(), permute.end(), + [&](sz_sorted_idx_t i, sz_sorted_idx_t j) { return strings[i] < strings[j]; }); }); - expect_sorted(strings, permute_base); + expect_sorted(strings, permute); - bench_permute("sz_sort_serial", strings, permute_new, [](strings_t const &strings, permute_t &permute) { + bench_permute("sz_sequence_argsort", [&]() { + std::iota(permute.begin(), permute.end(), 0); sz_sequence_t array; array.count = strings.size(); array.handle = &strings; array.get_start = get_start; array.get_length = get_length; - sz::_with_alloc>( - [&](sz_memory_allocator_t &alloc) { return sz_sort_serial(&array, &alloc, permute.data()); }); + sz::_with_alloc( + [&](sz_memory_allocator_t &alloc) { return sz_sequence_argsort(&array, &alloc, permute.data()); }); }); - expect_sorted(strings, permute_new); + expect_sorted(strings, permute); #if __linux__ && defined(_GNU_SOURCE) && !defined(__BIONIC__) - bench_permute("qsort_r", strings, permute_new, [](strings_t const &strings, permute_t &permute) { + bench_permute("qsort_r", [&]() { + std::iota(permute.begin(), permute.end(), 0); sz_sequence_t array; array.count = strings.size(); array.handle = &strings; array.get_start = get_start; array.get_length = get_length; - qsort_r(permute.data(), array.count, sizeof(sz_u64_t), _get_qsort_order, &array); + qsort_r(permute.data(), array.count, sizeof(sz_sorted_idx_t), _get_qsort_order, &array); }); - expect_sorted(strings, permute_new); + expect_sorted(strings, permute); #elif defined(_MSC_VER) - bench_permute("qsort_s", strings, 
permute_new, [](strings_t const &strings, permute_t &permute) { + bench_permute("qsort_s", [&]() { + std::iota(permute.begin(), permute.end(), 0); sz_sequence_t array; array.count = strings.size(); array.handle = &strings; array.get_start = get_start; array.get_length = get_length; - qsort_s(permute.data(), array.count, sizeof(sz_u64_t), _get_qsort_order, &array); + qsort_s(permute.data(), array.count, sizeof(sz_sorted_idx_t), _get_qsort_order, &array); }); - expect_sorted(strings, permute_new); + expect_sorted(strings, permute); #else sz_unused(_get_qsort_order); #endif std::printf("---- Stable Sorting:\n"); - bench_permute("std::stable_sort", strings, permute_base, [](strings_t const &strings, permute_t &permute) { - std::stable_sort(permute.begin(), permute.end(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; }); + bench_permute("std::stable_sort", [&]() { + std::iota(permute.begin(), permute.end(), 0); + std::stable_sort(permute.begin(), permute.end(), + [&](sz_sorted_idx_t i, sz_sorted_idx_t j) { return strings[i] < strings[j]; }); }); - expect_sorted(strings, permute_base); + expect_sorted(strings, permute); return 0; } diff --git a/scripts/test.cpp b/scripts/test.cpp index d8f0cdd6..7b3fe4db 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -1596,9 +1596,9 @@ static void test_sequence_algorithms() { using order_t = std::vector; // Basic tests with predetermined orders. - assert_scoped(strs_t x({"a", "b", "c", "d"}), (void)0, sz::sorted_order(x) == order_t({0u, 1u, 2u, 3u})); - assert_scoped(strs_t x({"b", "c", "d", "a"}), (void)0, sz::sorted_order(x) == order_t({3u, 0u, 1u, 2u})); - assert_scoped(strs_t x({"b", "a", "d", "c"}), (void)0, sz::sorted_order(x) == order_t({1u, 0u, 3u, 2u})); + assert_scoped(strs_t x({"a", "b", "c", "d"}), (void)0, sz::argsort(x) == order_t({0u, 1u, 2u, 3u})); + assert_scoped(strs_t x({"b", "c", "d", "a"}), (void)0, sz::argsort(x) == order_t({3u, 0u, 1u, 2u})); + assert_scoped(strs_t x({"b", "a", "d", "c"}), (void)0, sz::argsort(x) == order_t({1u, 0u, 3u, 2u})); // Test on long strings of identical length. for (std::size_t string_length : {5u, 25u}) { @@ -1611,7 +1611,7 @@ static void test_sequence_algorithms() { // Run several iterations of fuzzy tests. for (std::size_t experiment_idx = 0; experiment_idx < 10; ++experiment_idx) { std::shuffle(dataset.begin(), dataset.end(), global_random_generator()); - auto order = sz::sorted_order(dataset); + auto order = sz::argsort(dataset); for (std::size_t i = 1; i < dataset.size(); ++i) assert(dataset[order[i - 1]] <= dataset[order[i]]); } } @@ -1626,7 +1626,7 @@ static void test_sequence_algorithms() { // Run several iterations of fuzzy tests. for (std::size_t experiment_idx = 0; experiment_idx < 10; ++experiment_idx) { std::shuffle(dataset.begin(), dataset.end(), global_random_generator()); - auto order = sz::sorted_order(dataset); + auto order = sz::argsort(dataset); for (std::size_t i = 1; i < dataset_size; ++i) { assert(dataset[order[i - 1]] <= dataset[order[i]]); } } } @@ -1642,7 +1642,7 @@ static void test_sequence_algorithms() { // Run several iterations of fuzzy tests. 
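The `sz::argsort` calls in these fuzzy tests can also be exercised on a handful of hand-written strings; a minimal sketch in the same spirit (the values are made up, and any element type convertible to `sz::string_view` should behave the same way):

    std::vector<std::string> words {"banana", "apple", "cherry"};
    std::vector<sz_sorted_idx_t> order = sz::argsort(words);
    // Expected permutation: {1, 0, 2}, since "apple" < "banana" < "cherry"
    for (std::size_t i = 1; i < order.size(); ++i) assert(words[order[i - 1]] <= words[order[i]]);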
for (std::size_t experiment_idx = 0; experiment_idx < 10; ++experiment_idx) { std::shuffle(dataset.begin(), dataset.end(), global_random_generator()); - auto order = sz::sorted_order(dataset); + auto order = sz::argsort(dataset); for (std::size_t i = 1; i < dataset_size; ++i) { assert(dataset[order[i - 1]] <= dataset[order[i]]); } } } @@ -1656,7 +1656,7 @@ static void test_sequence_algorithms() { // Run several iterations of fuzzy tests. for (std::size_t experiment_idx = 0; experiment_idx < 10; ++experiment_idx) { std::shuffle(dataset.begin(), dataset.end(), global_random_generator()); - auto order = sz::sorted_order(dataset); + auto order = sz::argsort(dataset); for (std::size_t i = 1; i < dataset_size; ++i) { assert(dataset[order[i - 1]] <= dataset[order[i]]); } } } From db61d93a1c47f5de0faba07221446cb1a1021510 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 16 Feb 2025 23:31:57 +0000 Subject: [PATCH 106/751] Fix: Merge-step bug in stable sort --- include/stringzilla/sort.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index 9ea19e8d..e25876cf 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -498,6 +498,7 @@ void _sz_sequence_argsort_stable_serial_merge( // Compute the end pointers for each input array sz_pgram_t const *const first_end = first_pgrams + first_count; sz_pgram_t const *const second_end = second_pgrams + second_count; + sz_pgram_t *const merged_begin = result_pgrams; // Merge until one array is exhausted while (first_pgrams < first_end && second_pgrams < second_end) { @@ -510,13 +511,11 @@ void _sz_sequence_argsort_stable_serial_merge( *result_indices++ = *second_indices++; } else { - // Equal keys: for stability, choose the one from the first array + // Equal keys: for stability, choose the one from the first array, and don't increment the second array *result_pgrams++ = *first_pgrams; *result_indices++ = *first_indices; ++first_pgrams; ++first_indices; - ++second_pgrams; - ++second_indices; } } @@ -531,6 +530,11 @@ void _sz_sequence_argsort_stable_serial_merge( *result_pgrams++ = *second_pgrams++; *result_indices++ = *second_indices++; } + + // Validate the merged result. + if (SZ_DEBUG) + for (sz_size_t i = 1; i < first_count + second_count; ++i) + _sz_assert(merged_begin[i - 1] <= merged_begin[i] && "The merged pgrams must be in ascending order."); } SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, From a38867fdccf76ac00926c0cb90077eb9be1c5e44 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 16 Feb 2025 23:33:42 +0000 Subject: [PATCH 107/751] Improve: Expose Insertion-sort helpers --- include/stringzilla/sort.h | 133 ++++++++++++++++++++++++------------- 1 file changed, 86 insertions(+), 47 deletions(-) diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index e25876cf..b6fcdbc9 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -19,6 +19,11 @@ * - `sz_pgrams_sort` - to inplace sort continuous pointer-sized integers with QuickSort. * - `sz_pgrams_sort_stable` - to inplace stable-sort continuous pointer-sized integers with a MergeSort. * + * For cases, when the input is known to be tiny, we provide quadratic-complexity insertion sort adaptations: + * + * - `sz_sequence_argsort_with_insertion` - for string collections. 
+ * - `sz_pgrams_sort_stable_with_insertion` - for continuous unsigned integers. + * */ #ifndef STRINGZILLA_SORT_H_ #define STRINGZILLA_SORT_H_ @@ -145,6 +150,73 @@ SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_sve(sz_pgram_t *pgrams, sz_size_t coun #pragma endregion +#pragma region Generic Helpers + +/** + * @brief Quadratic complexity insertion sort adjust for our @b argsort usecase. + * Needs no extra memory and is used as a fallback for small inputs. + */ +SZ_PUBLIC void sz_sequence_argsort_with_insertion(sz_sequence_t const *sequence, sz_sorted_idx_t *order) { + // Assume `order` is already initialized with 0, 1, 2, ... N. + for (sz_size_t i = 1; i < sequence->count; ++i) { + sz_sorted_idx_t current_idx = order[i]; + sz_size_t j = i; + while (j > 0) { + // Get the two strings to compare. + sz_sorted_idx_t previous_idx = order[j - 1]; + sz_cptr_t previous_start = sequence->get_start(sequence, previous_idx); + sz_cptr_t current_start = sequence->get_start(sequence, current_idx); + sz_size_t previous_length = sequence->get_length(sequence, previous_idx); + sz_size_t current_length = sequence->get_length(sequence, current_idx); + + // Use the provided sz_order to compare. + sz_ordering_t ordering = sz_order(previous_start, previous_length, current_start, current_length); + + // If the previous string is not greater than current_idx, we're done. + if (ordering != sz_greater_k) break; + + // Otherwise, shift the previous element to the right. + order[j] = order[j - 1]; + --j; + } + order[j] = current_idx; + } +} + +/** + * @brief Quadratic complexity insertion sort adjust for our @b pgram-sorting usecase. + * Needs no extra memory and is used as a fallback for small inputs. + */ + +SZ_PUBLIC void sz_pgrams_sort_stable_with_insertion(sz_pgram_t *pgrams, sz_size_t count, sz_sorted_idx_t *order) { + + // Assume `order` is already initialized with 0, 1, 2, ... N. + for (sz_size_t i = 1; i < count; ++i) { + // Save the current key and corresponding index. + sz_pgram_t current_key = pgrams[i]; + sz_sorted_idx_t current_idx = order[i]; + sz_size_t j = i; + + // Shift elements of the sorted region that are greater than the current key + // to the right. This loop stops as soon as the correct insertion point is found. + while (j > 0 && pgrams[j - 1] > current_key) { + pgrams[j] = pgrams[j - 1]; + order[j] = order[j - 1]; + --j; + } + + // Insert the current key and index into their proper location. + pgrams[j] = current_key; + order[j] = current_idx; + } + + if (SZ_DEBUG) + for (sz_size_t i = 1; i < count; ++i) + _sz_assert(pgrams[i - 1] <= pgrams[i] && "The pgrams should be sorted in ascending order."); +} + +#pragma endregion + #pragma region Serial QuickSort Implementation SZ_PUBLIC void _sz_sequence_argsort_serial_export_next_pgrams( // @@ -341,37 +413,6 @@ SZ_PUBLIC void _sz_sequence_argsort_serial_next_pgrams( // } } -SZ_PUBLIC void _sz_sequence_argsort_serial_insertion(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { - // This algorithm needs no memory allocations: - sz_unused(alloc); - - // Assume `order` is already initialized with 0, 1, 2, ... N. - for (sz_size_t i = 1; i < sequence->count; ++i) { - sz_sorted_idx_t current_idx = order[i]; - sz_size_t j = i; - while (j > 0) { - // Get the two strings to compare. 
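Since both helpers are exported as `SZ_PUBLIC`, they can be called directly on tiny inputs without going through the allocator-backed entry points; a minimal sketch with made-up values:

    sz_pgram_t pgrams[4] = {42, 7, 7, 3};
    sz_sorted_idx_t order[4] = {0, 1, 2, 3}; // the helper assumes `order` is pre-initialized
    sz_pgrams_sort_stable_with_insertion(pgrams, 4, order);
    // pgrams becomes {3, 7, 7, 42} and order becomes {3, 1, 2, 0},
    // keeping the two equal 7s in their original relative order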
- sz_sorted_idx_t previous_idx = order[j - 1]; - sz_cptr_t previous_start = sequence->get_start(sequence, previous_idx); - sz_cptr_t current_start = sequence->get_start(sequence, current_idx); - sz_size_t previous_length = sequence->get_length(sequence, previous_idx); - sz_size_t current_length = sequence->get_length(sequence, current_idx); - - // Use the provided sz_order to compare. - sz_ordering_t ordering = sz_order(previous_start, previous_length, current_start, current_length); - - // If the previous string is not greater than current_idx, we're done. - if (ordering != sz_greater_k) break; - - // Otherwise, shift the previous element to the right. - order[j] = order[j - 1]; - --j; - } - order[j] = current_idx; - } -} - SZ_PUBLIC sz_bool_t sz_sequence_argsort_serial(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order) { @@ -381,7 +422,7 @@ SZ_PUBLIC sz_bool_t sz_sequence_argsort_serial(sz_sequence_t const *sequence, sz // On very small collections - just use the quadratic-complexity insertion sort // without any smart optimizations or memory allocations. if (sequence->count <= 32) { - _sz_sequence_argsort_serial_insertion(sequence, alloc, order); + sz_sequence_argsort_with_insertion(sequence, order); return sz_true_k; } @@ -543,22 +584,20 @@ SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t c // First, initialize the `order` with `std::iota`-like behavior. for (sz_size_t i = 0; i != count; ++i) order[i] = i; + // On very small collections - just use the quadratic-complexity insertion sort + // without any smart optimizations or memory allocations. + if (count <= 32) { + sz_pgrams_sort_stable_with_insertion(pgrams, count, order); + return sz_true_k; + } + // Go through short chunks of 8 elements and sort them with a sorting network. - for (sz_size_t i = 0; i + 8 <= count; i += 8) _sz_sequence_argsort_stable_serial_8x_network(pgrams + i, order + i); + for (sz_size_t i = 0; i + 8u <= count; i += 8u) + _sz_sequence_argsort_stable_serial_8x_network(pgrams + i, order + i); // For the tail of the array, sort it with insertion sort. - for (sz_size_t i = count & ~7; i < count; i++) { - sz_pgram_t current_address = pgrams[i]; - sz_sorted_idx_t current_idx = order[i]; - sz_size_t j = i; - while (j > 0 && pgrams[j - 1] > current_address) { - pgrams[j] = pgrams[j - 1]; - order[j] = order[j - 1]; - --j; - } - pgrams[j] = current_address; - order[j] = current_idx; - } + sz_size_t const tail_count = count & 7u; + sz_pgrams_sort_stable_with_insertion(pgrams + count - tail_count, tail_count, order + count - tail_count); // At this point, the array is partitioned into sorted runs. // We'll now merge these runs until the whole array is sorted. @@ -589,8 +628,8 @@ SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t c // Determine the number of elements in the right run. 
sz_size_t right_count = run_size; - if (i + run_size >= count) { right_count = 0; } - else if (i + run_size + right_count > count) { right_count = count - (i + run_size); } + if (i + left_count >= count) { right_count = 0; } + else if (i + left_count + right_count > count) { right_count = count - (i + left_count); } // Merge the two runs: _sz_sequence_argsort_stable_serial_merge( // From cd6859a56f7cdfba698d7389733871ec9cf45e8b Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 17 Feb 2025 11:01:02 +0000 Subject: [PATCH 108/751] Add: Smaller Sorting Networks It yields no noticeable performance improvements --- include/stringzilla/sort.h | 352 ++++++++++++++++++++++--------------- 1 file changed, 215 insertions(+), 137 deletions(-) diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index b6fcdbc9..977d29e1 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -150,7 +150,7 @@ SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_sve(sz_pgram_t *pgrams, sz_size_t coun #pragma endregion -#pragma region Generic Helpers +#pragma region Generic Public Helpers /** * @brief Quadratic complexity insertion sort adjust for our @b argsort usecase. @@ -210,16 +210,148 @@ SZ_PUBLIC void sz_pgrams_sort_stable_with_insertion(sz_pgram_t *pgrams, sz_size_ order[j] = current_idx; } - if (SZ_DEBUG) - for (sz_size_t i = 1; i < count; ++i) - _sz_assert(pgrams[i - 1] <= pgrams[i] && "The pgrams should be sorted in ascending order."); +#if SZ_DEBUG + for (sz_size_t i = 1; i < count; ++i) + _sz_assert(pgrams[i - 1] <= pgrams[i] && "The pgrams should be sorted in ascending order."); +#endif } -#pragma endregion +#pragma endregion // Generic Public Helpers + +#pragma region Generic Internal Helpers + +/** + * @brief Convenience macro for of conditional swap of "pgrams" and their indices for a sorting network. + * @see https://en.wikipedia.org/wiki/Sorting_network + */ +#define _sz_sequence_sorting_network_conditional_swap(i, j) \ + do { \ + if (pgrams[i] > pgrams[j]) { \ + _sz_swap(sz_pgram_t, pgrams[i], pgrams[j]); \ + _sz_swap(sz_sorted_idx_t, offsets[i], offsets[j]); \ + } \ + } while (0) + +/** + * @brief Sorting network for 2 elements is just a single compare–swap. + */ +SZ_INTERNAL void _sz_sequence_sorting_network_2x(sz_pgram_t *pgrams, sz_sorted_idx_t *offsets) { + _sz_sequence_sorting_network_conditional_swap(0, 1); +} + +/** + * @brief Sorting network for 3 elements. + * + * The network uses 3 compare–swap operations: + * + * Stage 1: (0, 1) + * Stage 2: (0, 2) + * Stage 3: (1, 2) + */ +SZ_INTERNAL void _sz_sequence_sorting_network_3x(sz_pgram_t *pgrams, sz_sorted_idx_t *offsets) { + + _sz_sequence_sorting_network_conditional_swap(0, 1); + _sz_sequence_sorting_network_conditional_swap(0, 2); + _sz_sequence_sorting_network_conditional_swap(1, 2); + +#if SZ_DEBUG + for (sz_size_t i = 1; i < 3; ++i) + _sz_assert(pgrams[i - 1] <= pgrams[i] && "Sorting network for 3 elements failed."); +#endif +} + +/** + * @brief Sorting network for 4 elements. + * + * The network uses 5 compare–swap operations: + * + * Stage 1: (0, 1) and (2, 3) + * Stage 2: (0, 2) + * Stage 3: (1, 3) + * Stage 4: (1, 2) + */ +SZ_INTERNAL void _sz_sequence_sorting_network_4x(sz_pgram_t *pgrams, sz_sorted_idx_t *offsets) { + + // Stage 1: Compare–swap adjacent pairs. 
+ _sz_sequence_sorting_network_conditional_swap(0, 1); + _sz_sequence_sorting_network_conditional_swap(2, 3); + + // Stage 2: Compare–swap (0, 2) + _sz_sequence_sorting_network_conditional_swap(0, 2); + + // Stage 3: Compare–swap (1, 3) + _sz_sequence_sorting_network_conditional_swap(1, 3); + + // Stage 4: Final compare–swap (1, 2) + _sz_sequence_sorting_network_conditional_swap(1, 2); + +#if SZ_DEBUG + for (sz_size_t i = 1; i < 4; ++i) + _sz_assert(pgrams[i - 1] <= pgrams[i] && "Sorting network for 4 elements failed."); +#endif +} + +/** + * @brief A scalar sorting network for 8 elements that reorders both the pgrams + * and their corresponding offsets in only 19 comparisons, the most efficient + * variant currently known. + * + * The network consists of 6 stages with the following compare–swap pairs: + * + * Stage 1: (0,1), (2,3), (4,5), (6,7) + * Stage 2: (0,2), (1,3), (4,6), (5,7) + * Stage 3: (1,2), (5,6) + * Stage 4: (0,4), (1,5), (2,6), (3,7) + * Stage 5: (2,4), (3,5) + * Stage 6: (1,2), (3,4), (5,6) + */ +SZ_INTERNAL void _sz_sequence_sorting_network_8x(sz_pgram_t *pgrams, sz_sorted_idx_t *offsets) { + + // Stage 1: Compare–swap adjacent pairs. + _sz_sequence_sorting_network_conditional_swap(0, 1); + _sz_sequence_sorting_network_conditional_swap(2, 3); + _sz_sequence_sorting_network_conditional_swap(4, 5); + _sz_sequence_sorting_network_conditional_swap(6, 7); + + // Stage 2: Compare–swap with stride 2. + _sz_sequence_sorting_network_conditional_swap(0, 2); + _sz_sequence_sorting_network_conditional_swap(1, 3); + _sz_sequence_sorting_network_conditional_swap(4, 6); + _sz_sequence_sorting_network_conditional_swap(5, 7); + + // Stage 3: Compare–swap between middle elements. + _sz_sequence_sorting_network_conditional_swap(1, 2); + _sz_sequence_sorting_network_conditional_swap(5, 6); + + // Stage 4: Compare–swap across the two halves. + _sz_sequence_sorting_network_conditional_swap(0, 4); + _sz_sequence_sorting_network_conditional_swap(1, 5); + _sz_sequence_sorting_network_conditional_swap(2, 6); + _sz_sequence_sorting_network_conditional_swap(3, 7); + + // Stage 5: Compare–swap within each half. + _sz_sequence_sorting_network_conditional_swap(2, 4); + _sz_sequence_sorting_network_conditional_swap(3, 5); + + // Stage 6: Final compare–swap of adjacent elements. + _sz_sequence_sorting_network_conditional_swap(1, 2); + _sz_sequence_sorting_network_conditional_swap(3, 4); + _sz_sequence_sorting_network_conditional_swap(5, 6); + +#if SZ_DEBUG + // Validate the sorting network. + for (sz_size_t i = 1; i < 8; ++i) + _sz_assert(pgrams[i - 1] <= pgrams[i] && "The sorting network must sort the pgrams in ascending order."); +#endif +} + +#undef _sz_sequence_sorting_network_conditional_swap + +#pragma endregion // Generic Internal Helpers #pragma region Serial QuickSort Implementation -SZ_PUBLIC void _sz_sequence_argsort_serial_export_next_pgrams( // +SZ_INTERNAL void _sz_sequence_argsort_serial_export_next_pgrams( // sz_sequence_t const *const sequence, // sz_pgram_t *const global_pgrams, sz_sorted_idx_t const *const global_order, // sz_size_t const start_in_sequence, sz_size_t const end_in_sequence, // @@ -227,7 +359,7 @@ SZ_PUBLIC void _sz_sequence_argsort_serial_export_next_pgrams( // Depending on the architecture, we will export a different number of bytes. // On 32-bit architectures, we will export 3 bytes, and on 64-bit architectures - 7 bytes. 
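To make the export below concrete, a worked example assuming a 64-bit little-endian build, where every `sz_pgram_t` carries up to 7 characters plus a length byte, and the byte reversal puts the first character into the most significant byte so that plain integer comparison follows lexicographic order:

    // "cat" -> bytes {'c', 'a', 't', 0, 0, 0, 0} + length byte 3 -> 0x6361740000000003 after reversal
    // "car" -> 0x6361720000000003 and "ca" -> 0x6361000000000002
    // hence "ca" < "car" < "cat" both as strings and as pgram integers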
- sz_size_t const window_capacity = sizeof(sz_pgram_t) - 1; + sz_size_t const pgram_capacity = sizeof(sz_pgram_t) - 1; // Perform the same operation for every string. for (sz_size_t i = start_in_sequence; i < end_in_sequence; ++i) { @@ -241,14 +373,14 @@ SZ_PUBLIC void _sz_sequence_argsort_serial_export_next_pgrams( sz_cptr_t const source_str = sequence->get_start(sequence, partial_order_index); sz_size_t const length = sequence->get_length(sequence, partial_order_index); sz_size_t const remaining_length = length > start_character ? length - start_character : 0; - sz_size_t const exported_length = remaining_length > window_capacity ? window_capacity : remaining_length; + sz_size_t const exported_length = remaining_length > pgram_capacity ? pgram_capacity : remaining_length; // Fill with zeros, export a slice, and mark the exported length. sz_pgram_t *target_pgram = &global_pgrams[i]; sz_ptr_t target_str = (sz_ptr_t)target_pgram; *target_pgram = 0; for (sz_size_t j = 0; j < exported_length; ++j) target_str[j] = source_str[j + start_character]; - target_str[window_capacity] = exported_length; + target_str[pgram_capacity] = exported_length; #if defined(_SZ_IS_64_BIT) *target_pgram = sz_u64_bytes_reverse(*target_pgram); #else @@ -259,36 +391,52 @@ SZ_PUBLIC void _sz_sequence_argsort_serial_export_next_pgrams( "We can have a zero value if only the string is shorter than other strings at this position."); } - // As our goal is to sort the strings using the exported integer "windows", + // As our goal is to sort the strings using the exported integer "pgrams", // this is a good place to validate the correctness of the exported data. if (SZ_DEBUG && start_character == 0) for (sz_size_t i = start_in_sequence + 1; i < end_in_sequence; ++i) { - sz_pgram_t const previous_window = global_pgrams[i - 1]; - sz_pgram_t const current_window = global_pgrams[i]; + sz_pgram_t const previous_pgram = global_pgrams[i - 1]; + sz_pgram_t const current_pgram = global_pgrams[i]; sz_cptr_t const previous_str = sequence->get_start(sequence, i - 1); sz_size_t const previous_length = sequence->get_length(sequence, i - 1); sz_cptr_t const current_str = sequence->get_start(sequence, i); sz_size_t const current_length = sequence->get_length(sequence, i); - sz_ordering_t const ordering = sz_order( // - previous_str, previous_length > window_capacity ? window_capacity : previous_length, // - current_str, current_length > window_capacity ? window_capacity : current_length); - _sz_assert( // - (previous_window < current_window) == (ordering == sz_less_k) && // - "The exported windows should be in the same order as the original strings."); + sz_ordering_t const ordering = sz_order( // + previous_str, previous_length > pgram_capacity ? pgram_capacity : previous_length, // + current_str, current_length > pgram_capacity ? pgram_capacity : current_length); + _sz_assert( // + (previous_pgram < current_pgram) == (ordering == sz_less_k) && // + "The exported pgrams should be in the same order as the original strings."); } } /** - * @brief The most important part of the QuickSort algorithm, that rearranges the elements in - * such a way, that all entries around the pivot are less than the pivot. - * - * It means that no relative order among the elements on the left or right side of the pivot is preserved. - * We chose the pivot point using Robert Sedgewick's method - the median of three elements - the first, - * the middle, and the last element of the given range. 
+ * @brief Picks the "pivot" value for the QuickSort algorithm's partitioning step using Robert Sedgewick's method, + * the median of three elements - the first, the middle, and the last element of the given range. + */ +SZ_INTERNAL sz_pgram_t _sz_sequence_partitioning_pivot(sz_pgram_t const *pgrams, sz_size_t count) { + sz_size_t const middle_offset = count / 2; + sz_pgram_t const first_pgram = pgrams[0]; + sz_pgram_t const middle_pgram = pgrams[middle_offset]; + sz_pgram_t const last_pgram = pgrams[count - 1]; + if (first_pgram < middle_pgram) { + if (middle_pgram < last_pgram) { return middle_pgram; } + else if (first_pgram < last_pgram) { return last_pgram; } + else { return first_pgram; } + } + else { + if (first_pgram < last_pgram) { return first_pgram; } + else if (middle_pgram < last_pgram) { return last_pgram; } + else { return middle_pgram; } + } +} + +/** + * @brief The most important part of the QuickSort algorithm partitioning the elements around the pivot. * - * Moreover, considering our iterative refinement procedure, we can't just use the normal 2-way partitioning, - * as it will scatter the values equal to the pivot into the left and right partitions. Instead we use the - * Dutch National Flag @b 3-way partitioning, outputting the range of values equal to the pivot. + * The classical variant uses the normal 2-way partitioning, but it will scatter the values equal to the pivot + * into the left and right partitions. Instead we use the Dutch National Flag @b 3-way partitioning, outputting + * the range of values equal to the pivot. * * @see https://en.wikipedia.org/wiki/Dutch_national_flag_problem */ @@ -297,47 +445,42 @@ SZ_PUBLIC void _sz_sequence_argsort_serial_3way_partition( // sz_size_t const start_in_sequence, sz_size_t const end_in_sequence, // sz_size_t *first_pivot_offset, sz_size_t *last_pivot_offset) { - // Chose the pivot offset with Sedgewick's method. - sz_pgram_t pivot_window; - { - sz_size_t const middle_offset = start_in_sequence + (end_in_sequence - start_in_sequence) / 2; - sz_size_t const last_offset = end_in_sequence - 1; - sz_size_t const first_offset = start_in_sequence; - sz_pgram_t const first_window = global_pgrams[first_offset]; - sz_pgram_t const middle_window = global_pgrams[middle_offset]; - sz_pgram_t const last_window = global_pgrams[last_offset]; - if (first_window < middle_window) { - if (middle_window < last_window) { pivot_window = middle_window; } - else if (first_window < last_window) { pivot_window = last_window; } - else { pivot_window = first_window; } - } - else { - if (first_window < last_window) { pivot_window = first_window; } - else if (middle_window < last_window) { pivot_window = last_window; } - else { pivot_window = middle_window; } - } + // On very small inputs this procedure is rudimentary. + sz_size_t const count = end_in_sequence - start_in_sequence; + if (count <= 4) { + sz_pgram_t *const pgrams = global_pgrams + start_in_sequence; + sz_sorted_idx_t *const offsets = global_order + start_in_sequence; + if (count == 2) { _sz_sequence_sorting_network_2x(pgrams, offsets); } + else if (count == 3) { _sz_sequence_sorting_network_3x(pgrams, offsets); } + else if (count == 4) { _sz_sequence_sorting_network_4x(pgrams, offsets); } + *first_pivot_offset = start_in_sequence; + *last_pivot_offset = end_in_sequence; + return; } + // Chose the pivot offset with Sedgewick's method. 
+ sz_pgram_t const pivot_pgram = _sz_sequence_partitioning_pivot(global_pgrams + start_in_sequence, count); + // Loop through the collection and move the elements around the pivot with the 3-way partitioning. sz_size_t partitioning_progress = start_in_sequence; // Current index. - sz_size_t smaller_offset = start_in_sequence; // Boundary for elements < pivot_window. - sz_size_t greater_offset = end_in_sequence - 1; // Boundary for elements > pivot_window. + sz_size_t smaller_offset = start_in_sequence; // Boundary for elements < `pivot_pgram`. + sz_size_t greater_offset = end_in_sequence - 1; // Boundary for elements > `pivot_pgram`. while (partitioning_progress <= greater_offset) { // Element is less than pivot: swap into the < pivot region. - if (global_pgrams[partitioning_progress] < pivot_window) { + if (global_pgrams[partitioning_progress] < pivot_pgram) { _sz_swap(sz_sorted_idx_t, global_order[partitioning_progress], global_order[smaller_offset]); _sz_swap(sz_pgram_t, global_pgrams[partitioning_progress], global_pgrams[smaller_offset]); ++partitioning_progress; ++smaller_offset; } // Element is greater than pivot: swap into the > pivot region. - else if (global_pgrams[partitioning_progress] > pivot_window) { + else if (global_pgrams[partitioning_progress] > pivot_pgram) { _sz_swap(sz_sorted_idx_t, global_order[partitioning_progress], global_order[greater_offset]); _sz_swap(sz_pgram_t, global_pgrams[partitioning_progress], global_pgrams[greater_offset]); --greater_offset; } - // Element equals pivot_window: leave it in place. + // Element equals `pivot_pgram`: leave it in place. else { ++partitioning_progress; } } @@ -349,7 +492,7 @@ SZ_PUBLIC void _sz_sequence_argsort_serial_3way_partition( // * @brief Recursive Quick-Sort implementation backing both the `sz_sequence_argsort` and `sz_pgrams_sort`, * and using the `_sz_sequence_argsort_serial_3way_partition` under the hood. */ -SZ_PUBLIC void _sz_sequence_argsort_serial_recursively( // +SZ_INTERNAL void _sz_sequence_argsort_serial_recursively( // sz_pgram_t *const global_pgrams, sz_sorted_idx_t *const global_order, // sz_size_t const start_in_sequence, sz_size_t const end_in_sequence) { @@ -372,41 +515,41 @@ SZ_PUBLIC void _sz_sequence_argsort_serial_recursively( // /** * @brief Recursive Quick-Sort adaptation for strings, that processes the strings a few N-grams at a time. * It combines `_sz_sequence_argsort_serial_export_next_pgrams` and `_sz_sequence_argsort_serial_recursively`, - * recursively diving into the identical windows. + * recursively diving into the identical pgrams. */ -SZ_PUBLIC void _sz_sequence_argsort_serial_next_pgrams( // +SZ_INTERNAL void _sz_sequence_argsort_serial_next_pgrams( // sz_sequence_t const *const sequence, // sz_pgram_t *const global_pgrams, sz_sorted_idx_t *const global_order, // sz_size_t const start_in_sequence, sz_size_t const end_in_sequence, // sz_size_t const start_character) { - // Prepare the new range of windows + // Prepare the new range of pgrams _sz_sequence_argsort_serial_export_next_pgrams(sequence, global_pgrams, global_order, start_in_sequence, end_in_sequence, start_character); - // Sort current windows with a quicksort + // Sort current pgrams with a quicksort _sz_sequence_argsort_serial_recursively(global_pgrams, global_order, start_in_sequence, end_in_sequence); // Depending on the architecture, we will export a different number of bytes. // On 32-bit architectures, we will export 3 bytes, and on 64-bit architectures - 7 bytes. 
-    sz_size_t const window_capacity = sizeof(sz_pgram_t) - 1;
+    sz_size_t const pgram_capacity = sizeof(sz_pgram_t) - 1;
-    // Repeat the procedure for the identical windows
+    // Repeat the procedure for the identical pgrams
     sz_size_t nested_start = start_in_sequence;
     sz_size_t nested_end = start_in_sequence;
     while (nested_end != end_in_sequence) {
-        // Find the end of the identical windows
-        sz_pgram_t current_window_integer = global_pgrams[nested_start];
-        while (nested_end != end_in_sequence && current_window_integer == global_pgrams[nested_end]) ++nested_end;
+        // Find the end of the identical pgrams
+        sz_pgram_t current_pgram = global_pgrams[nested_start];
+        while (nested_end != end_in_sequence && current_pgram == global_pgrams[nested_end]) ++nested_end;
-        // If the identical windows are not trivial and each string has more characters, sort them recursively
-        sz_cptr_t current_window_str = (sz_cptr_t)&current_window_integer;
-        sz_size_t current_window_length = (sz_size_t)current_window_str[0]; //! The byte order was swapped
+        // If the identical pgrams are not trivial and each string has more characters, sort them recursively
+        sz_cptr_t current_pgram_str = (sz_cptr_t)&current_pgram;
+        sz_size_t current_pgram_length = (sz_size_t)current_pgram_str[0]; //! The byte order was swapped
         int has_multiple_strings = nested_end - nested_start > 1;
-        int has_more_characters_in_each = current_window_length == window_capacity;
+        int has_more_characters_in_each = current_pgram_length == pgram_capacity;
         if (has_multiple_strings && has_more_characters_in_each) {
             _sz_sequence_argsort_serial_next_pgrams(sequence, global_pgrams, global_order, nested_start, nested_end,
-                                                    start_character + window_capacity);
+                                                    start_character + pgram_capacity);
         }
         // Move to the next
         nested_start = nested_end;
@@ -438,14 +581,14 @@ SZ_PUBLIC sz_bool_t sz_sequence_argsort_serial(sz_sequence_t const *sequence, sz
     // is included in those P-long words. So, in reality, we will be taking (P-1) bytes from each string on every
     // iteration of a recursive algorithm.
     sz_size_t memory_usage = sequence->count * sizeof(sz_pgram_t);
-    sz_pgram_t *windows = (sz_pgram_t *)alloc->allocate(memory_usage, alloc);
-    if (!windows) return sz_false_k;
+    sz_pgram_t *pgrams = (sz_pgram_t *)alloc->allocate(memory_usage, alloc);
+    if (!pgrams) return sz_false_k;
     // Recursively sort the whole sequence.
-    _sz_sequence_argsort_serial_next_pgrams(sequence, windows, order, 0, sequence->count, 0);
+    _sz_sequence_argsort_serial_next_pgrams(sequence, pgrams, order, 0, sequence->count, 0);
     // Free temporary storage.
-    alloc->free(windows, memory_usage, alloc);
+    alloc->free(pgrams, memory_usage, alloc);
     return sz_true_k;
 }
@@ -463,75 +606,11 @@ SZ_PUBLIC sz_bool_t sz_pgrams_sort_serial(sz_pgram_t *pgrams, sz_size_t count, s
 #pragma region Serial MergeSort Implementation
-/**
- * @brief A scalar sorting network for 8 elements that reorders both the keys
- * and their corresponding offsets in only 19 comparisons, the most efficient
- * variant currently known.
- * @see https://en.wikipedia.org/wiki/Sorting_network - * - * The network consists of 6 stages with the following compare–swap pairs: - * - * Stage 1: (0,1), (2,3), (4,5), (6,7) - * Stage 2: (0,2), (1,3), (4,6), (5,7) - * Stage 3: (1,2), (5,6) - * Stage 4: (0,4), (1,5), (2,6), (3,7) - * Stage 5: (2,4), (3,5) - * Stage 6: (1,2), (3,4), (5,6) - */ -void _sz_sequence_argsort_stable_serial_8x_network(sz_pgram_t *keys, sz_sorted_idx_t *offsets) { - -#define _sz_sequence_argsort_stable_8x_conditional_swap(i, j) \ - do { \ - if (keys[i] > keys[j]) { \ - _sz_swap(sz_pgram_t, keys[i], keys[j]); \ - _sz_swap(sz_sorted_idx_t, offsets[i], offsets[j]); \ - } \ - } while (0) - - // Stage 1: Compare–swap adjacent pairs. - _sz_sequence_argsort_stable_8x_conditional_swap(0, 1); - _sz_sequence_argsort_stable_8x_conditional_swap(2, 3); - _sz_sequence_argsort_stable_8x_conditional_swap(4, 5); - _sz_sequence_argsort_stable_8x_conditional_swap(6, 7); - - // Stage 2: Compare–swap with stride 2. - _sz_sequence_argsort_stable_8x_conditional_swap(0, 2); - _sz_sequence_argsort_stable_8x_conditional_swap(1, 3); - _sz_sequence_argsort_stable_8x_conditional_swap(4, 6); - _sz_sequence_argsort_stable_8x_conditional_swap(5, 7); - - // Stage 3: Compare–swap between middle elements. - _sz_sequence_argsort_stable_8x_conditional_swap(1, 2); - _sz_sequence_argsort_stable_8x_conditional_swap(5, 6); - - // Stage 4: Compare–swap across the two halves. - _sz_sequence_argsort_stable_8x_conditional_swap(0, 4); - _sz_sequence_argsort_stable_8x_conditional_swap(1, 5); - _sz_sequence_argsort_stable_8x_conditional_swap(2, 6); - _sz_sequence_argsort_stable_8x_conditional_swap(3, 7); - - // Stage 5: Compare–swap within each half. - _sz_sequence_argsort_stable_8x_conditional_swap(2, 4); - _sz_sequence_argsort_stable_8x_conditional_swap(3, 5); - - // Stage 6: Final compare–swap of adjacent elements. - _sz_sequence_argsort_stable_8x_conditional_swap(1, 2); - _sz_sequence_argsort_stable_8x_conditional_swap(3, 4); - _sz_sequence_argsort_stable_8x_conditional_swap(5, 6); - -#undef _sz_sequence_argsort_stable_8x_conditional_swap - - // Validate the sorting network. - if (SZ_DEBUG) - for (sz_size_t i = 1; i < 8; ++i) - _sz_assert(keys[i - 1] <= keys[i] && "The sorting network must sort the keys in ascending order."); -} - /** * @brief Helper function similar to `std::set_union` over pairs of integers and their original indices. * @see https://en.cppreference.com/w/cpp/algorithm/set_union */ -void _sz_sequence_argsort_stable_serial_merge( // +SZ_INTERNAL void _sz_sequence_argsort_stable_serial_merge( // sz_pgram_t const *first_pgrams, sz_sorted_idx_t const *first_indices, sz_size_t first_count, // sz_pgram_t const *second_pgrams, sz_sorted_idx_t const *second_indices, sz_size_t second_count, // sz_pgram_t *result_pgrams, sz_sorted_idx_t *result_indices) { @@ -592,8 +671,7 @@ SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t c } // Go through short chunks of 8 elements and sort them with a sorting network. - for (sz_size_t i = 0; i + 8u <= count; i += 8u) - _sz_sequence_argsort_stable_serial_8x_network(pgrams + i, order + i); + for (sz_size_t i = 0; i + 8u <= count; i += 8u) _sz_sequence_sorting_network_8x(pgrams + i, order + i); // For the tail of the array, sort it with insertion sort. 
    sz_size_t const tail_count = count & 7u;

From 71f1f4baecca3e3692acc561c74f376a351478f6 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 20 Feb 2025 13:35:39 +0000
Subject: [PATCH 109/751] Break: `checksum` to `bytesum`, new hash, and PRNG

---
 README.md                           |  2 +-
 c/lib.c                             | 63 +++++++++++++++++++++++++----
 include/stringzilla/stringzilla.h   |  2 +-
 include/stringzilla/stringzilla.hpp | 43 ++++++++-----------
 include/stringzilla/types.h         | 16 ++++++--
 python/lib.c                        | 10 ++---
 rust/lib.rs                         | 20 ++++-----
 scripts/bench_fingerprint.cpp       |  2 +-
 scripts/bench_token.cpp             | 14 +++----
 scripts/test.cpp                    | 12 +++---
 scripts/test.py                     |  4 +-
 11 files changed, 119 insertions(+), 69 deletions(-)

diff --git a/README.md b/README.md
index c5253c4c..c657decf 100644
--- a/README.md
+++ b/README.md
@@ -629,7 +629,7 @@ sz_size_t substring_position = sz_find_haswell(haystack.start, haystack.length,
 sz_size_t substring_position = sz_find_neon(haystack.start, haystack.length, needle.start, needle.length);
 
 // Hash strings
-sz_u64_t hash = sz_hash(haystack.start, haystack.length);
+sz_u64_t hash = sz_hash(haystack.start, haystack.length, 42); // or any other seed ;)
 
 // Perform collection level operations
 sz_sequence_t array = {your_handle, your_count, your_get_start, your_get_length};
diff --git a/c/lib.c b/c/lib.c
index 64e7b61a..c68e7a1f 100644
--- a/c/lib.c
+++ b/c/lib.c
@@ -177,7 +177,12 @@ typedef struct sz_implementations_t {
     sz_move_t move;
     sz_fill_t fill;
     sz_look_up_transform_t look_up_transform;
-    sz_checksum_t checksum;
+
+    sz_bytesum_t bytesum;
+    sz_hash_t hash;
+    sz_hash_state_init_t hash_state_init;
+    sz_hash_state_stream_t hash_state_stream;
+    sz_hash_state_fold_t hash_state_fold;
 
     sz_find_byte_t find_byte;
     sz_find_byte_t rfind_byte;
@@ -214,7 +219,12 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) {
     impl->move = sz_move_serial;
     impl->fill = sz_fill_serial;
     impl->look_up_transform = sz_look_up_transform_serial;
-    impl->checksum = sz_checksum_serial;
+
+    impl->bytesum = sz_bytesum_serial;
+    impl->hash = sz_hash_serial;
+    impl->hash_state_init = sz_hash_state_init_serial;
+    impl->hash_state_stream = sz_hash_state_stream_serial;
+    impl->hash_state_fold = sz_hash_state_fold_serial;
 
     impl->find = sz_find_serial;
     impl->rfind = sz_rfind_serial;
@@ -236,7 +246,12 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) {
         impl->move = sz_move_haswell;
         impl->fill = sz_fill_haswell;
         impl->look_up_transform = sz_look_up_transform_haswell;
-        impl->checksum = sz_checksum_haswell;
+
+        impl->bytesum = sz_bytesum_haswell;
+        impl->hash = sz_hash_haswell;
+        impl->hash_state_init = sz_hash_state_init_haswell;
+        impl->hash_state_stream = sz_hash_state_stream_haswell;
+        impl->hash_state_fold = sz_hash_state_fold_haswell;
 
         impl->find_byte = sz_find_byte_haswell;
         impl->rfind_byte = sz_rfind_byte_haswell;
@@ -256,11 +271,17 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) {
         impl->move = sz_move_skylake;
         impl->fill = sz_fill_skylake;
 
+        impl->bytesum = sz_bytesum_skylake;
+        impl->hash = sz_hash_skylake;
+        impl->hash_state_init = sz_hash_state_init_skylake;
+        impl->hash_state_stream = sz_hash_state_stream_skylake;
+        impl->hash_state_fold = sz_hash_state_fold_skylake;
+
         impl->find = sz_find_skylake;
         impl->rfind = sz_rfind_skylake;
         impl->find_byte = sz_find_byte_skylake;
         impl->rfind_byte = sz_rfind_byte_skylake;
-        impl->checksum = sz_checksum_skylake;
+        impl->bytesum = sz_bytesum_skylake;
     }
 #endif
 
@@ -268,10 +289,17 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) {
     if (caps & sz_cap_ice_k) {
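        // Ice Lake adds AVX-512 VBMI, VNNI, and VAES, so charset search, similarity scoring, LUT transforms, and hashing get dedicated kernels below.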
impl->find_from_set = sz_find_charset_ice; impl->rfind_from_set = sz_rfind_charset_ice; + impl->edit_distance = sz_edit_distance_ice; impl->alignment_score = sz_alignment_score_ice; + impl->look_up_transform = sz_look_up_transform_ice; - impl->checksum = sz_checksum_ice; + + impl->bytesum = sz_bytesum_ice; + impl->hash = sz_hash_ice; + impl->hash_state_init = sz_hash_state_init_ice; + impl->hash_state_stream = sz_hash_state_stream_ice; + impl->hash_state_fold = sz_hash_state_fold_ice; } #endif @@ -283,7 +311,12 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->move = sz_move_neon; impl->fill = sz_fill_neon; impl->look_up_transform = sz_look_up_transform_neon; - impl->checksum = sz_checksum_neon; + + impl->bytesum = sz_bytesum_neon; + impl->hash = sz_hash_neon; + impl->hash_state_init = sz_hash_state_init_neon; + impl->hash_state_stream = sz_hash_state_stream_neon; + impl->hash_state_fold = sz_hash_state_fold_neon; impl->find = sz_find_neon; impl->rfind = sz_rfind_neon; @@ -331,7 +364,23 @@ BOOL WINAPI _DllMainCRTStartup(HINSTANCE hints, DWORD forward_reason, LPVOID lp) __attribute__((constructor)) static void sz_dispatch_table_init_on_gcc_or_clang(void) { sz_dispatch_table_init(); } #endif -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { return sz_dispatch_table.checksum(text, length); } +SZ_DYNAMIC sz_u64_t sz_bytesum(sz_cptr_t text, sz_size_t length) { return sz_dispatch_table.bytesum(text, length); } + +SZ_DYNAMIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length, sz_u64_t seed) { + return sz_dispatch_table.hash(text, length, seed); +} + +SZ_DYNAMIC void sz_hash_state_init(sz_hash_state_t *state, sz_u64_t seed) { + sz_dispatch_table.hash_state_init(state, seed); +} + +SZ_DYNAMIC void sz_hash_state_stream(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { + sz_dispatch_table.hash_state_stream(state, text, length); +} + +SZ_DYNAMIC sz_u64_t sz_hash_state_fold(sz_hash_state_t const *state) { + return sz_dispatch_table.hash_state_fold(state); +} SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { return sz_dispatch_table.equal(a, b, length); diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index 0b23b33b..349aba79 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -43,7 +43,7 @@ #include "compare.h" // `sz_equal`, `sz_order` #include "find.h" // `sz_find`, `sz_find_charset`, `sz_rfind` -#include "hash.h" // `sz_checksum`, `sz_hash`, `sz_hashes` +#include "hash.h" // `sz_bytesum`, `sz_hash`, `sz_state_init`, `sz_state_stream`, `sz_state_fold` #include "memory.h" // `sz_copy`, `sz_move`, `sz_fill` #include "similarity.h" // `sz_edit_distance`, `sz_alignment_score` #include "small_string.h" // `sz_string_t`, `sz_string_init`, `sz_string_free` diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index 0a4737ad..12f65265 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -1929,7 +1929,7 @@ class basic_string_slice { size_type hash() const noexcept { return static_cast(sz_hash(start_, length_)); } /** @brief Aggregates the values of individual bytes of a string. */ - size_type checksum() const noexcept { return static_cast(sz_checksum(start_, length_)); } + size_type bytesum() const noexcept { return static_cast(sz_bytesum(start_, length_)); } /** @brief Populate a character set with characters present in this string. 
*/ char_set as_set() const noexcept { @@ -3326,33 +3326,30 @@ class basic_string { size_type hash() const noexcept { return view().hash(); } /** @brief Aggregates the values of individual bytes of a string. */ - size_type checksum() const noexcept { return view().checksum(); } + size_type bytesum() const noexcept { return view().bytesum(); } /** - * @brief Overwrites the string with random characters from the given alphabet using the random generator. + * @brief Overwrites the string with random binary data. * - * @param generator A random generator function object that returns a random number in the range [0, 2^64). - * @param alphabet A string of characters to choose from. + * @param nonce "Number used ONCE" to initialize the random number generator, @b don't repeat it! + * @param key A 128-bit key to initialize the AES-CTR block-cypher, zeros by default. */ - template - basic_string &randomize(generator_type &generator, string_view alphabet = "abcdefghijklmnopqrstuvwxyz") noexcept { + basic_string &randomize(sz_u64_t nonce, sz_aes128_block_t key = {}) noexcept { sz_ptr_t start; sz_size_t length; sz_string_range(&string_, &start, &length); - sz_random_generator_t generator_callback = &_call_random_generator; - sz_generate(alphabet.data(), alphabet.size(), start, length, generator_callback, &generator); + sz_generate(start, length, nonce, &key); return *this; } /** - * @brief Overwrites the string with random characters from the given alphabet - * using `std::rand` as the random generator. - * - * @param alphabet A string of characters to choose from. + * @brief Overwrites the string with random binary data. + * Produces the nonce from a static variable, incrementing it each time. + * In this case the undefined behaviour in concurrent environments plays in our favor. */ - basic_string &randomize(string_view alphabet = "abcdefghijklmnopqrstuvwxyz") noexcept { - auto generator = []() { return static_cast(std::rand()); }; - return randomize(generator, alphabet); + basic_string &randomize() noexcept { + static sz_u64_t nonce = 42; + return randomize(nonce++, {}); } /** @@ -3360,25 +3357,19 @@ class basic_string { * May throw exceptions if the memory allocation fails. * * @param length The length of the generated string. - * @param alphabet A string of characters to choose from. + * @param nonce "Number used ONCE" to initialize the random number generator, @b don't repeat it! */ - static basic_string random(size_type length, string_view alphabet = "abcdefghijklmnopqrstuvwxyz") noexcept(false) { - return basic_string(length, '\0').randomize(alphabet); + static basic_string random(size_type length, sz_u64_t nonce) noexcept(false) { + return basic_string(length, '\0').randomize(nonce); } /** * @brief Generate a new random string of given length using the provided random number generator. * May throw exceptions if the memory allocation fails. * - * @param generator A random generator function object that returns a random number in the range [0, 2^64). * @param length The length of the generated string. - * @param alphabet A string of characters to choose from. 
*/ - template - static basic_string random(generator_type &generator, size_type length, - string_view alphabet = "abcdefghijklmnopqrstuvwxyz") noexcept(false) { - return basic_string(length, '\0').randomize(generator, alphabet); - } + static basic_string random(size_type length) noexcept(false) { return basic_string(length, '\0').randomize(); } /** * @brief Replaces ( @b in-place ) all occurrences of a given string with the ::replacement string. diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index 01d090b2..a3b9d62e 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -443,10 +443,19 @@ SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void #pragma region API Signature Types /** @brief Signature of ::sz_hash. */ -typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t); +typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t, sz_u64_t); -/** @brief Signature of ::sz_checksum. */ -typedef sz_u64_t (*sz_checksum_t)(sz_cptr_t, sz_size_t); +/** @brief Signature of ::sz_hash_state_init. */ +typedef void (*sz_hash_state_init_t)(struct sz_hash_state_t *, sz_u64_t); + +/** @brief Signature of ::sz_hash_state_stream. */ +typedef void (*sz_hash_state_stream_t)(struct sz_hash_state_t *, sz_cptr_t, sz_size_t); + +/** @brief Signature of ::sz_hash_state_fold. */ +typedef sz_u64_t (*sz_hash_state_fold_t)(struct sz_hash_state_t const *); + +/** @brief Signature of ::sz_bytesum. */ +typedef sz_u64_t (*sz_bytesum_t)(sz_cptr_t, sz_size_t); /** @brief Signature of ::sz_equal. */ typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t); @@ -887,6 +896,7 @@ SZ_INTERNAL sz_i32_t sz_i32_max_of_two(sz_i32_t x, sz_i32_t y) { return x - ((x #pragma GCC push_options #pragma GCC target("bmi", "bmi2") #pragma clang attribute push(__attribute__((target("bmi,bmi2"))), apply_to = function) +SZ_INTERNAL __mmask8 _sz_u8_mask_until(sz_size_t n) { return (__mmask8)_bzhi_u32(0xFFu, n); } SZ_INTERNAL __mmask16 _sz_u16_mask_until(sz_size_t n) { return (__mmask16)_bzhi_u32(0xFFFFu, n); } SZ_INTERNAL __mmask32 _sz_u32_mask_until(sz_size_t n) { return (__mmask32)_bzhi_u64(0xFFFFFFFFu, n); } SZ_INTERNAL __mmask64 _sz_u64_mask_until(sz_size_t n) { return (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n); } diff --git a/python/lib.c b/python/lib.c index 1406691c..6e334719 100644 --- a/python/lib.c +++ b/python/lib.c @@ -717,7 +717,7 @@ static PyObject *Str_like_hash(PyObject *self, PyObject *args, PyObject *kwargs) return PyLong_FromUnsignedLongLong((unsigned long long)result); } -static char const doc_like_checksum[] = // +static char const doc_like_bytesum[] = // "Compute the checksum of individual byte values in a string.\n" "\n" "This function can be called as a method on a Str object or as a standalone function.\n" @@ -728,12 +728,12 @@ static char const doc_like_checksum[] = // "Raises:\n" " TypeError: If the argument is not string-like or incorrect number of arguments is provided."; -static PyObject *Str_like_checksum(PyObject *self, PyObject *args, PyObject *kwargs) { +static PyObject *Str_like_bytesum(PyObject *self, PyObject *args, PyObject *kwargs) { // Check minimum arguments int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); Py_ssize_t nargs = PyTuple_Size(args); if (nargs < !is_member || nargs > !is_member + 1 || kwargs) { - PyErr_SetString(PyExc_TypeError, "checksum() expects exactly one positional argument"); + PyErr_SetString(PyExc_TypeError, "bytesum() expects exactly one positional argument"); return NULL; } @@ -746,7 +746,7 @@ 
static PyObject *Str_like_checksum(PyObject *self, PyObject *args, PyObject *kwa return NULL; } - sz_u64_t result = sz_checksum(text.start, text.length); + sz_u64_t result = sz_bytesum(text.start, text.length); return PyLong_FromUnsignedLongLong((unsigned long long)result); } @@ -3684,7 +3684,7 @@ static PyMethodDef stringzilla_methods[] = { // Global unary extensions {"hash", Str_like_hash, SZ_METHOD_FLAGS, doc_like_hash}, - {"checksum", Str_like_checksum, SZ_METHOD_FLAGS, doc_like_checksum}, + {"bytesum", Str_like_bytesum, SZ_METHOD_FLAGS, doc_like_bytesum}, {NULL, NULL, 0, NULL}}; diff --git a/rust/lib.rs b/rust/lib.rs index 08c8772a..07db0a32 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -56,7 +56,7 @@ pub mod sz { fn sz_hash(text: *const c_void, length: usize) -> u64; - fn sz_checksum(text: *const c_void, length: usize) -> u64; + fn sz_bytesum(text: *const c_void, length: usize) -> u64; fn sz_edit_distance( haystack1: *const c_void, @@ -123,21 +123,21 @@ pub mod sz { /// # Returns /// /// A `u64` representing the checksum value of the input byte slice. - pub fn checksum(text: T) -> u64 + pub fn bytesum(text: T) -> u64 where T: AsRef<[u8]>, { let text_ref = text.as_ref(); let text_pointer = text_ref.as_ptr() as _; let text_length = text_ref.len(); - let result = unsafe { sz_checksum(text_pointer, text_length) }; + let result = unsafe { sz_bytesum(text_pointer, text_length) }; return result; } /// Computes a 64-bit AES-based hash value for a given byte slice `text`. /// This function is designed to provide a high-quality hash value for use in /// hash tables, data structures, and cryptographic applications. - /// Unlike the checksum function, the hash function is order-sensitive. + /// Unlike the bytesum function, the hash function is order-sensitive. /// /// # Arguments /// @@ -1034,7 +1034,7 @@ pub trait StringZilla<'a, N> where N: AsRef<[u8]> + 'a, { - /// Computes the checksum value of unsigned bytes in a given string. + /// Computes the bytesum value of unsigned bytes in a given string. /// This function is useful for verifying data integrity and detecting changes in /// binary data, such as files or network packets. /// @@ -1044,14 +1044,14 @@ where /// use stringzilla::StringZilla; /// /// let text = "Hello"; - /// assert_eq!(text.sz_checksum(), Some(500)); + /// assert_eq!(text.sz_bytesum(), Some(500)); /// ``` - fn sz_checksum(&self) -> u64; + fn sz_bytesum(&self) -> u64; /// Computes a 64-bit AES-based hash value for a given string. /// This function is designed to provide a high-quality hash value for use in /// hash tables, data structures, and cryptographic applications. - /// Unlike the checksum function, the hash function is order-sensitive. + /// Unlike the bytesum function, the hash function is order-sensitive. 
/// /// # Examples /// @@ -1352,8 +1352,8 @@ where T: AsRef<[u8]> + ?Sized, N: AsRef<[u8]> + 'a, { - fn sz_checksum(&self) -> u64 { - sz::checksum(self) + fn sz_bytesum(&self) -> u64 { + sz::bytesum(self) } fn sz_hash(&self) -> u64 { diff --git a/scripts/bench_fingerprint.cpp b/scripts/bench_fingerprint.cpp index 82064a29..cbc2812c 100644 --- a/scripts/bench_fingerprint.cpp +++ b/scripts/bench_fingerprint.cpp @@ -90,7 +90,7 @@ void bench(strings_type &&strings) { if (strings.size() == 0) return; // Benchmark logical operations - bench_unary_functions(strings, checksum_functions()); + bench_unary_functions(strings, bytesum_functions()); bench_unary_functions(strings, hashing_functions()); bench_binary_functions(strings, equality_functions()); bench_binary_functions(strings, ordering_functions()); diff --git a/scripts/bench_token.cpp b/scripts/bench_token.cpp index 749daa85..64ba2f96 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -11,7 +11,7 @@ using namespace ashvardanian::stringzilla::scripts; -tracked_unary_functions_t checksum_functions() { +tracked_unary_functions_t bytesum_functions() { auto wrap_sz = [](auto function) -> unary_function_t { return unary_function_t([function](std::string_view s) { return function(s.data(), s.size()); }); }; @@ -21,18 +21,18 @@ tracked_unary_functions_t checksum_functions() { return std::accumulate(s.begin(), s.end(), (std::size_t)0, [](std::size_t sum, char c) { return sum + static_cast(c); }); }}, - {"sz_checksum_serial", wrap_sz(sz_checksum_serial), true}, + {"sz_bytesum_serial", wrap_sz(sz_bytesum_serial), true}, #if SZ_USE_HASWELL - {"sz_checksum_haswell", wrap_sz(sz_checksum_haswell), true}, + {"sz_bytesum_haswell", wrap_sz(sz_bytesum_haswell), true}, #endif #if SZ_USE_SKYLAKE - {"sz_checksum_skylake", wrap_sz(sz_checksum_skylake), true}, + {"sz_bytesum_skylake", wrap_sz(sz_bytesum_skylake), true}, #endif #if SZ_USE_ICE - {"sz_checksum_ice", wrap_sz(sz_checksum_ice), true}, + {"sz_bytesum_ice", wrap_sz(sz_bytesum_ice), true}, #endif #if SZ_USE_NEON - {"sz_checksum_neon", wrap_sz(sz_checksum_neon), true}, + {"sz_bytesum_neon", wrap_sz(sz_bytesum_neon), true}, #endif }; return result; @@ -139,7 +139,7 @@ void bench(strings_type &&strings) { if (strings.size() == 0) return; // Benchmark logical operations - bench_unary_functions(strings, checksum_functions()); + bench_unary_functions(strings, bytesum_functions()); bench_unary_functions(strings, hashing_functions()); bench_binary_functions(strings, equality_functions()); bench_binary_functions(strings, ordering_functions()); diff --git a/scripts/test.cpp b/scripts/test.cpp index 7b3fe4db..58752a35 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -802,17 +802,17 @@ static void test_non_stl_extensions_for_reads() { return std::accumulate(s.begin(), s.end(), (std::size_t)0, [](std::size_t sum, char c) { return sum + static_cast(c); }); }; - assert(str("a").checksum() == (std::size_t)'a'); - assert(str("0").checksum() == (std::size_t)'0'); - assert(str("0123456789").checksum() == arithmetic_sum('0', '9')); - assert(str("abcdefghijklmnopqrstuvwxyz").checksum() == arithmetic_sum('a', 'z')); - assert(str("abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz").checksum() == + assert(str("a").bytesum() == (std::size_t)'a'); + assert(str("0").bytesum() == (std::size_t)'0'); + assert(str("0123456789").bytesum() == arithmetic_sum('0', '9')); + assert(str("abcdefghijklmnopqrstuvwxyz").bytesum() == arithmetic_sum('a', 'z')); + 
assert(str("abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz").bytesum() == arithmetic_sum('a', 'z') * 3); assert_scoped( str s = "近来,加文出席微博之夜时对着镜头频繁摆出假笑表情、一度累瘫睡倒在沙发上的照片被广泛转发,引发对他失去童年、" "被过度消费的担忧。八岁的加文,已当网红近六年了,可以说,自懂事以来,他没有过过一天没有名气的日子。", - (void)0, s.checksum() == accumulate_bytes(s)); + (void)0, s.bytesum() == accumulate_bytes(s)); // Computing edit-distances. assert(sz::hamming_distance(str("hello"), str("hello")) == 0); diff --git a/scripts/test.py b/scripts/test.py index 93a01706..ea95e8d4 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -777,12 +777,12 @@ def test_translations_random(length: int): @pytest.mark.repeat(3) @pytest.mark.parametrize("length", list(range(0, 300)) + [1024, 4096, 100000]) -def test_checksums_random(length: int): +def test_bytesums_random(length: int): def sum_bytes(body: str) -> int: return sum([ord(c) for c in body]) body = get_random_string(length=length) - assert sum_bytes(body) == sz.checksum(body) + assert sum_bytes(body) == sz.bytesum(body) @pytest.mark.parametrize("list_length", [10, 20, 30, 40, 50]) From cb18c787c0b413a8ce1422222f2a0a68f9e102dc Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 20 Feb 2025 22:46:36 +0000 Subject: [PATCH 110/751] Add: AES-based hash placeholders --- README.md | 14 +- c/lib.c | 30 +- drafts/fingerprint.h | 13 +- drafts/sort.h | 124 +++++ include/stringzilla/find.h | 17 +- include/stringzilla/hash.h | 760 +++++++++++++++++++++------- include/stringzilla/sort.h | 217 ++++++-- include/stringzilla/stringzilla.hpp | 85 ++-- include/stringzilla/types.h | 5 +- rust/lib.rs | 14 +- scripts/bench_token.cpp | 26 +- scripts/test.cpp | 7 +- 12 files changed, 978 insertions(+), 334 deletions(-) create mode 100644 drafts/sort.h diff --git a/README.md b/README.md index c657decf..22c8e2b0 100644 --- a/README.md +++ b/README.md @@ -622,14 +622,22 @@ Both are companions of the `sz_find`, first for x86 CPUs with AVX-512 support, a sz_string_view_t haystack = {your_text, your_text_length}; sz_string_view_t needle = {your_subtext, your_subtext_length}; -// Perform string-level operations +// Perform string-level operations auto-picking the backend or dispatching manually sz_size_t substring_position = sz_find(haystack.start, haystack.length, needle.start, needle.length); sz_size_t substring_position = sz_find_skylake(haystack.start, haystack.length, needle.start, needle.length); sz_size_t substring_position = sz_find_haswell(haystack.start, haystack.length, needle.start, needle.length); sz_size_t substring_position = sz_find_neon(haystack.start, haystack.length, needle.start, needle.length); -// Hash strings -sz_u64_t hash = sz_hash(haystack.start, haystack.length, 42); // or any other seed ;) +// Hash strings at once +sz_u64_t hash = sz_hash(haystack.start, haystack.length, 42); // 42 is the seed +sz_u64_t checksum = sz_bytesum(haystack.start, haystack.length); // or accumulate byte values + +// Hash strings incrementally with "init", "stream", and "fold": +sz_hash_state_t state; +sz_hash_state_init(&state, 42); +sz_hash_state_stream(&state, haystack.start, 1); // first char +sz_hash_state_stream(&state, haystack.start + 1, haystack.length - 1); // rest of the string +sz_u64_t hash = sz_hash_state_fold(&state); // Perform collection level operations sz_sequence_t array = {your_handle, your_count, your_get_start, your_get_length}; diff --git a/c/lib.c b/c/lib.c index c68e7a1f..559062ba 100644 --- a/c/lib.c +++ b/c/lib.c @@ -183,6 +183,7 @@ typedef struct 
sz_implementations_t { sz_hash_state_init_t hash_state_init; sz_hash_state_stream_t hash_state_stream; sz_hash_state_fold_t hash_state_fold; + sz_generate_t generate; sz_find_byte_t find_byte; sz_find_byte_t rfind_byte; @@ -225,6 +226,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->hash_state_init = sz_hash_state_init_serial; impl->hash_state_stream = sz_hash_state_stream_serial; impl->hash_state_fold = sz_hash_state_fold_serial; + impl->generate = sz_generate_serial; impl->find = sz_find_serial; impl->rfind = sz_rfind_serial; @@ -252,6 +254,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->hash_state_init = sz_hash_state_init_haswell; impl->hash_state_stream = sz_hash_state_stream_haswell; impl->hash_state_fold = sz_hash_state_fold_haswell; + impl->generate = sz_generate_haswell; impl->find_byte = sz_find_byte_haswell; impl->rfind_byte = sz_rfind_byte_haswell; @@ -276,6 +279,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->hash_state_init = sz_hash_state_init_skylake; impl->hash_state_stream = sz_hash_state_stream_skylake; impl->hash_state_fold = sz_hash_state_fold_skylake; + impl->generate = sz_generate_skylake; impl->find = sz_find_skylake; impl->rfind = sz_rfind_skylake; @@ -300,6 +304,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->hash_state_init = sz_hash_state_init_ice; impl->hash_state_stream = sz_hash_state_stream_ice; impl->hash_state_fold = sz_hash_state_fold_ice; + impl->generate = sz_generate_ice; } #endif @@ -317,6 +322,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->hash_state_init = sz_hash_state_init_neon; impl->hash_state_stream = sz_hash_state_stream_neon; impl->hash_state_fold = sz_hash_state_fold_neon; + impl->generate = sz_generate_neon; impl->find = sz_find_neon; impl->rfind = sz_rfind_neon; @@ -382,6 +388,10 @@ SZ_DYNAMIC sz_u64_t sz_hash_state_fold(sz_hash_state_t const *state) { return sz_dispatch_table.hash_state_fold(state); } +SZ_DYNAMIC void sz_generate(sz_ptr_t result, sz_size_t result_length, sz_u64_t nonce) { + sz_dispatch_table.generate(result, result_length, nonce); +} + SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { return sz_dispatch_table.equal(a, b, length); } @@ -499,22 +509,6 @@ SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_ return sz_rfind_charset(h, h_length, &set); } -#if !SZ_AVOID_LIBC -sz_u64_t _sz_random_generator(void *empty_state) { - sz_unused(empty_state); - return (sz_u64_t)rand(); -} -#endif - -SZ_DYNAMIC void sz_generate( // - sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t generator, void *generator_user_data) { -#if !SZ_AVOID_LIBC - if (!generator) generator = _sz_random_generator; -#endif - sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data); -} - // Provide overrides for the libc mem* functions #if SZ_OVERRIDE_LIBC && !defined(__CYGWIN__) @@ -591,8 +585,8 @@ SZ_DYNAMIC void *memrchr(void const *s, int c_wide, size_t n) { } SZ_DYNAMIC void memfrob(void *s, size_t n) { - char const *base64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - sz_generate(base64, 64, s, n, SZ_NULL, SZ_NULL); + static sz_u64_t nonce = 42; + sz_generate(s, n, nonce++); } #endif diff --git a/drafts/fingerprint.h b/drafts/fingerprint.h index 9cdfcc5e..a442d7a0 100644 --- a/drafts/fingerprint.h +++ b/drafts/fingerprint.h @@ -1,17 +1,8 @@ /** - * @brief Hardware-accelerated string hashing and checksums. 
+ * @brief Hardware-accelerated rolling string hashes or fingerprints. * @file hash.h * @author Ash Vardanian - * - * Includes core APIs: - * - * - `sz_checksum` - for byte-level checksums. - * - `sz_hash` - for 64-bit single-shot hashing. - * - `sz_hashes` - producing the rolling hashes of a string. - * - `sz_generate` - populating buffers with random data. - * - * Convenience functions for character-set matching: - * + * - `sz_hashes_fingerprint` * - `sz_hashes_intersection` */ diff --git a/drafts/sort.h b/drafts/sort.h new file mode 100644 index 00000000..bc1bb34e --- /dev/null +++ b/drafts/sort.h @@ -0,0 +1,124 @@ + + +/** + * @brief Perform a compare–exchange (compare–swap) on two 8‑lane vectors, + * updating both the keys and their associated offsets. + * + * @param keys Pointer to a __m512i containing 8 keys. + * @param offsets Pointer to a __m512i containing 8 offsets. + * @param perm Permutation vector (as __m512i) that maps each lane + * to its “partner” in the compare–exchange. + * @param fixed_mask An 8‑bit immediate mask (as __mmask8) that indicates, + * for each pair, which lane is designated as the “upper” + * element. For that lane the max is chosen, while for the + * complementary (“lower”) lane the min is chosen. + * + * This helper function “mirrors” the scalar operation: + * + * if (keys[i] > keys[j]) { + * swap(keys[i], keys[j]); + * swap(offsets[i], offsets[j]); + * } + * + * for each pair (i,j) defined by the permutation vector. + * + * The keys are updated by computing the unsigned min and max between each + * element and its partner, and then blending them into the designated positions + * using the fixed_mask. In order to update the offsets in a stable manner, + * we first compute the partner offsets (using the same permutation), then for each + * pair we choose: + * + * - For the lane designated as lower (mask bit = 0): + * if (orig_key <= partner_key) then keep self’s offset, + * else take the partner’s offset. + * + * - For the lane designated as upper (mask bit = 1): + * if (orig_key > partner_key) then keep self’s offset, + * else take the partner’s offset. + * + * This ensures that if keys are equal (thus stable), no swap is done. + */ +SZ_INTERNAL void cswap_argsort_avx512(__m512i *pgrams, __m512i *offsets, __m512i perm, __mmask8 fixed_mask) { + // Save original pgrams and offsets for condition computation. + __m512i orig_pgrams = *pgrams; + __m512i orig_offsets = *offsets; + + // Compute partner vectors using the permutation vector. + __m512i partner_pgrams = _mm512_permutexvar_epi64(perm, orig_pgrams); + __m512i partner_offsets = _mm512_permutexvar_epi64(perm, orig_offsets); + + // Compute new pgrams: for each pair, choose the unsigned min for the lower lane + // and the unsigned max for the upper lane. + __m512i pgrams_min = _mm512_min_epu64(orig_pgrams, partner_pgrams); + __m512i pgrams_max = _mm512_max_epu64(orig_pgrams, partner_pgrams); + *pgrams = _mm512_mask_blend_epi64(fixed_mask, pgrams_min, pgrams_max); + + // For offsets, we want to mimic the swap decision used for pgrams. + // For each pair (i,j) (with i < j), if orig_pgrams[i] <= partner_pgrams[i] then + // the lower key came from the current lane (i) and the upper from the partner (j); + // otherwise the lower key came from the partner. + __mmask8 lower_cond = + _mm512_cmp_epu64_mask(orig_pgrams, partner_pgrams, _MM_CMPINT_LE); // true if no swap needed for lower lane. 
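+    // These masks later drive `_mm512_mask_blend_epi64`, which takes the second operand where a mask bit is set and the first where it is clear.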
+ __mmask8 upper_cond = + _mm512_cmp_epu64_mask(orig_pgrams, partner_pgrams, _MM_CMPINT_GT); // true if swap needed for upper lane. + + // Compute offsets for lower positions (fixed_mask bit = 0): + // If lower_cond is true, then the current lane’s offset is correct; + // otherwise, use the partner’s offset. + __m512i offsets_lower = _mm512_mask_blend_epi64(lower_cond, partner_offsets, orig_offsets); + + // Compute offsets for upper positions (fixed_mask bit = 1): + // If upper_cond is true, then keep the current lane’s offset; + // otherwise, use the partner’s offset. + __m512i offsets_upper = _mm512_mask_blend_epi64(upper_cond, orig_offsets, partner_offsets); + + // Combine the two sets: for lanes designated as lower (mask bit = 0) use offsets_lower; + // for lanes designated as upper (mask bit = 1) use offsets_upper. + *offsets = _mm512_mask_blend_epi64(fixed_mask, offsets_lower, offsets_upper); + + // Validate the sorting network. + if (SZ_DEBUG) { + sz_pgram_t pgrams_array[8]; + sz_sorted_idx_t offsets_array[8]; + _mm512_storeu_si512(pgrams_array, *pgrams); + _mm512_storeu_si512(offsets_array, *offsets); + for (sz_size_t i = 1; i < 8; ++i) + _sz_assert(pgrams_array[i - 1] <= pgrams_array[i] && + "The sorting network must sort the pgrams in ascending order."); + } +} + +SZ_PUBLIC void _sz_sequence_argsort_ice_recursively( // + sz_sequence_t const *const collection, // + sz_pgram_t *const global_pgrams, sz_size_t *const global_order, // + sz_size_t const start_in_sequence, sz_size_t const end_in_sequence, // + sz_size_t const start_character) { + + // Prepare the new range of windows + _sz_sequence_argsort_serial_export_next_pgrams(collection, global_pgrams, global_order, start_in_sequence, + end_in_sequence, start_character); + + // We can implement a form of a Radix sort here, that will count the number of elements with + // a certain bit set. The naive approach may require too many loops over data. A more "vectorized" + // approach would be to maintain a histogram for several bits at once. For 4 bits we will + // need 2^4 = 16 counters. + sz_size_t histogram[16] = {0}; + for (sz_size_t byte_in_window = 0; byte_in_window != sizeof(sz_pgram_t); ++byte_in_window) { + // First sort based on the low nibble of each byte. + for (sz_size_t i = start_in_sequence; i < end_in_sequence; ++i) { + sz_size_t const byte = (global_pgrams[i] >> (byte_in_window * 8)) & 0xFF; + ++histogram[byte]; + } + sz_size_t offset = start_in_sequence; + for (sz_size_t i = 0; i != 16; ++i) { + sz_size_t const count = histogram[i]; + histogram[i] = offset; + offset += count; + } + for (sz_size_t i = start_in_sequence; i < end_in_sequence; ++i) { + sz_size_t const byte = (global_pgrams[i] >> (byte_in_window * 8)) & 0xFF; + global_order[histogram[byte]] = i; + ++histogram[byte]; + } + } +} diff --git a/include/stringzilla/find.h b/include/stringzilla/find.h index b5740429..90b6a16f 100644 --- a/include/stringzilla/find.h +++ b/include/stringzilla/find.h @@ -5,7 +5,6 @@ * * Includes core APIs: * - * - `sz_equal` * - `sz_find` and reverse-order `sz_rfind` * - `sz_find_byte` and reverse-order `sz_rfind_byte` * - `sz_find_charset` and reverse-order `sz_rfind_charset` @@ -138,10 +137,10 @@ SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cpt * May have identical implementation and performance to ::sz_rfind_charset. * * Useful for parsing, when we want to skip a set of characters. Examples: - * * 6 whitespaces: " \t\n\r\v\f". - * * 16 digits forming a float number: "0123456789,.eE+-". 
- * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing.
- * * 2 JSON string special characters useful to locate the end of the string: "\"\\".
+ * - 6 whitespaces: " \t\n\r\v\f".
+ * - 16 digits forming a float number: "0123456789,.eE+-".
+ * - 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing.
+ * - 2 JSON string special characters useful to locate the end of the string: "\"\\".
  *
  * @param text String to be scanned.
  * @param set Set of relevant characters.
@@ -155,10 +154,10 @@ SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charse
  * May have identical implementation and performance to ::sz_find_charset.
  *
  * Useful for parsing, when we want to skip a set of characters. Examples:
- * * 6 whitespaces: " \t\n\r\v\f".
- * * 16 digits forming a float number: "0123456789,.eE+-".
- * * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing.
- * * 2 JSON string special characters useful to locate the end of the string: "\"\\".
+ * - 6 whitespaces: " \t\n\r\v\f".
+ * - 16 digits forming a float number: "0123456789,.eE+-".
+ * - 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing.
+ * - 2 JSON string special characters useful to locate the end of the string: "\"\\".
  *
  * @param text String to be scanned.
  * @param set Set of relevant characters.
diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h
index 415b4b67..2094a0d3 100644
--- a/include/stringzilla/hash.h
+++ b/include/stringzilla/hash.h
@@ -1,13 +1,55 @@
 /**
- * @brief  Hardware-accelerated string hashing and checksums.
+ * @brief  Hardware-accelerated non-cryptographic string hashing and checksums.
  * @file   hash.h
 *  @author Ash Vardanian
 *
 *  Includes core APIs:
 *
- * - `sz_checksum` - for byte-level 64-bit unsigned checksums.
- * - `sz_hash` - for 64-bit single-shot hashing.
- * - `sz_generate` - populating buffers with random data.
+ * - `sz_bytesum` - for byte-level 64-bit unsigned checksums.
+ * - `sz_hash` - for 64-bit single-shot hashing using AES instructions.
+ * - `sz_hash_state_init`, `sz_hash_state_stream`, `sz_hash_state_fold` - for incremental hashing.
+ * - `sz_generate` - for populating buffers with pseudo-random noise using AES instructions.
+ *
+ * Why the hell do we need yet another hashing library?!
+ * Turns out, most existing libraries have noticeable constraints. Try finding a library that:
+ *
+ * - Outputs 64-bit or 128-bit hashes and passes the SMHasher test suite.
+ * - Is fast for both short and long strings.
+ * - Supports incremental @b (streaming) hashing, when the data arrives in chunks.
+ * - Supports custom seeds for hashes and secret strings for security.
+ * - Provides dynamic dispatch for different architectures to simplify deployment.
+ * - Uses modern SIMD, including not just AVX2 and NEON, but also AVX-512 and SVE2.
+ * - Documents its logic and guarantees the same output across different platforms.
+ *
+ * This includes projects like "MurmurHash", "CityHash", "SpookyHash", "FarmHash", "MetroHash", "HighwayHash", etc.
+ * There are 2 libraries that are close to meeting these requirements: "xxHash" in C++ and "aHash" in Rust:
+ *
+ * - "aHash" is fast, but written in Rust, has no dynamic dispatch, and lacks AVX-512 and SVE2 support.
+ *   It also does not adhere to a fixed output, and can't be used in applications like computing packet checksums
+ *   in network traffic or implementing persistent data structures.
+ * + * - "xxHash" is implemented in C, has an extremely wide set of third-party language bindings, and provides both + * 32-, 64-, and 128-bit hashes. It is fast, but its dynamic dispatch is limited to x86 with `xxh_x86dispatch.c`. + * + * StringZilla uses a scheme more similar to the "aHash" library, utilizing the AES extensions, that provide + * a remarkable level of "mixing per cycle" and are broadly available on modern CPUs. Similar to "aHash", they + * are combined with "shuffle & add" instructions to provide a high level of entropy in the output. That operation + * is practically free, as many modern CPUs will dispatch them on different ports. On x86, for example: + * + * - `VAESDEC` (ZMM, ZMM, ZMM)`: + * - on Intel Ice Lake: 5 cycles on port 0. + * - On AMD Zen4: 4 cycles on ports 0 or 1. + * - `VPSHUFB_Z (ZMM, K, ZMM, ZMM)` + * - on Intel Ice Lake: 3 cycles on port 5. + * - On AMD Zen4: 2 cycles on ports 1 or 2. + * - `VPADDQ (ZMM, ZMM, ZMM)`: + * - on Intel Ice Lake: 1 cycle on ports 0 or 5. + * - On AMD Zen4: 1 cycle on ports 0, 1, 2, 3. + * + * Unlike "aHash", on long inputs, we use a procedure that is more vector-friendly on modern servers. + * Unlike "aHash", we don't load interleaved memory regions, making vectorized variant more similar to sequential. + * On platforms like Skylake-X or newer, we also benefit from masked loads. + * */ #ifndef STRINGZILLA_HASH_H_ #define STRINGZILLA_HASH_H_ @@ -28,196 +70,205 @@ extern "C" { * @param length Number of bytes in the text. * @return 64-bit unsigned value. */ -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length); +SZ_DYNAMIC sz_u64_t sz_bytesum(sz_cptr_t text, sz_size_t length); /** - * @brief Computes the 64-bit unsigned hash of a string. Fairly fast for short strings, - * simple implementation, and supports rolling computation, reused in other APIs. - * Similar to `std::hash` in C++. + * @brief Computes the 64-bit unsigned hash of a string similar to @b `std::hash` in C++. + * It's not cryptographically secure, but it's fast and provides a good distribution. + * It passes the SMHasher suite by Austin Appleby with no collisions, even with `--extra` flag. + * @see HASH.md for a detailed explanation of the algorithm. * * @param text String to hash. * @param length Number of bytes in the text. + * @param seed 64-bit unsigned seed for the hash. * @return 64-bit hash value. */ -SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length) { - sz_unused(text && length); - return 0; -} +SZ_DYNAMIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length, sz_u64_t seed); + +/** + * @brief A Pseudorandom Number Generator (PRNG), inspired the AES-CTR-128 algorithm, + * but using only one round of AES mixing as opposed to "NIST SP 800-90A". + * + * CTR_DRBG (CounTeR mode Deterministic Random Bit Generator) appears secure and indistinguishable from a true + * random source when AES is used as the underlying block cipher and 112 bits are taken from this PRNG. + * When AES is used as the underlying block cipher and 128 bits are taken from each instantiation, + * the required security level is delivered with the caveat that a 128-bit cipher's output in + * counter mode can be distinguished from a true RNG. + * + * In this case, it doesn't apply, as we only use one round of AES mixing. We also don't expose a separate "key", + * only a "nonce", to keep the API simple. + * + * @param text Output string buffer to be populated. + * @param length Number of bytes in the string. 
+ * @param nonce "Number used ONCE" to ensure uniqueness of produced blocks. + */ +SZ_DYNAMIC void sz_generate(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); + +/** + * @brief The state for incremental construction of a hash. + * @see sz_hash_state_init, sz_hash_state_stream, sz_hash_state_fold. + */ +typedef struct sz_hash_state_t { + sz_u512_vec_t aes; + sz_u512_vec_t sum; + sz_u512_vec_t key; + + sz_u512_vec_t ins; + sz_size_t ins_length; +} sz_hash_state_t; + +typedef struct _sz_hash_minimal_t { + sz_u128_vec_t aes; + sz_u128_vec_t sum; + sz_u128_vec_t key; +} _sz_hash_minimal_t; + +/** + * @brief Initializes the state for incremental construction of a hash. + * + * @param state The state to initialize. + * @param seed The 64-bit unsigned seed for the hash. + */ +SZ_DYNAMIC void sz_hash_state_init(sz_hash_state_t *state, sz_u64_t seed); /** - * @brief Generates a random string for a given alphabet, avoiding integer division and modulo operations. - * Similar to `text[i] = alphabet[rand() % cardinality]`. + * @brief Updates the state with new data. * - * The modulo operation is expensive, and should be avoided in performance-critical code. - * We avoid it using small lookup tables and replacing it with a multiplication and shifts, similar to `libdivide`. - * Alternative algorithms would include: - * - Montgomery form: https://en.algorithmica.org/hpc/number-theory/montgomery/ - * - Barret reduction: https://www.nayuki.io/page/barrett-reduction-algorithm - * - Lemire's trick: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ + * @param state The state to stream. + * @param text The new data to include in the hash. + * @param length The number of bytes in the new data. + */ +SZ_DYNAMIC void sz_hash_state_stream(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length); + +/** + * @brief Finalizes the state and returns the hash. * - * @param alphabet Set of characters to sample from. - * @param cardinality Number of characters to sample from. - * @param text Output string, can point to the same address as ::text. - * @param generate Callback producing random numbers given the generator state. - * @param generator Generator state, can be a pointer to a seed, or a pointer to a random number generator. + * @param state The state to fold. + * @return The 64-bit hash value. 
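+ *
+ * A minimal usage sketch (illustrative only, with an arbitrary seed of 42): streaming the same bytes
+ * in chunks with the same seed is expected to match the single-shot `sz_hash` of the whole string.
+ *
+ *      sz_hash_state_t state;
+ *      sz_hash_state_init(&state, 42);
+ *      sz_hash_state_stream(&state, "Hello, ", 7);
+ *      sz_hash_state_stream(&state, "world!", 6);
+ *      sz_u64_t streamed = sz_hash_state_fold(&state); // expected to equal sz_hash("Hello, world!", 13, 42)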
*/ -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, - sz_random_generator_t generate, void *generator); +SZ_DYNAMIC sz_u64_t sz_hash_state_fold(sz_hash_state_t const *state); -/** @copydoc sz_checksum */ -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length); +/** @copydoc sz_bytesum */ +SZ_PUBLIC sz_u64_t sz_bytesum_serial(sz_cptr_t text, sz_size_t length); /** @copydoc sz_hash */ -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length); +SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length, sz_u64_t seed); /** @copydoc sz_generate */ -SZ_PUBLIC void sz_generate_serial( // - sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length, sz_random_generator_t generate, - void *generator) { - sz_unused(alphabet && cardinality && text && length && generate && generator); -} +SZ_PUBLIC void sz_generate_serial(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); + +/** @copydoc sz_hash_state_init */ +SZ_PUBLIC void sz_hash_state_init_serial(sz_hash_state_t *state, sz_u64_t seed); + +/** @copydoc sz_hash_state_stream */ +SZ_PUBLIC void sz_hash_state_stream_serial(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length); + +/** @copydoc sz_hash_state_fold */ +SZ_PUBLIC sz_u64_t sz_hash_state_fold_serial(sz_hash_state_t const *state); + +/** @copydoc sz_bytesum */ +SZ_PUBLIC sz_u64_t sz_bytesum_haswell(sz_cptr_t text, sz_size_t length); + +/** @copydoc sz_hash */ +SZ_PUBLIC sz_u64_t sz_hash_haswell(sz_cptr_t text, sz_size_t length, sz_u64_t seed); + +/** @copydoc sz_generate */ +SZ_PUBLIC void sz_generate_haswell(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); + +/** @copydoc sz_hash_state_init */ +SZ_PUBLIC void sz_hash_state_init_haswell(sz_hash_state_t *state, sz_u64_t seed); + +/** @copydoc sz_hash_state_stream */ +SZ_PUBLIC void sz_hash_state_stream_haswell(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length); + +/** @copydoc sz_hash_state_fold */ +SZ_PUBLIC sz_u64_t sz_hash_state_fold_haswell(sz_hash_state_t const *state); + +/** @copydoc sz_bytesum */ +SZ_PUBLIC sz_u64_t sz_bytesum_skylake(sz_cptr_t text, sz_size_t length); + +/** @copydoc sz_hash */ +SZ_PUBLIC sz_u64_t sz_hash_skylake(sz_cptr_t text, sz_size_t length, sz_u64_t seed); + +/** @copydoc sz_generate */ +SZ_PUBLIC void sz_generate_skylake(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); + +/** @copydoc sz_hash_state_init */ +SZ_PUBLIC void sz_hash_state_init_skylake(sz_hash_state_t *state, sz_u64_t seed); + +/** @copydoc sz_hash_state_stream */ +SZ_PUBLIC void sz_hash_state_stream_skylake(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length); + +/** @copydoc sz_hash_state_fold */ +SZ_PUBLIC sz_u64_t sz_hash_state_fold_skylake(sz_hash_state_t const *state); + +/** @copydoc sz_bytesum */ +SZ_PUBLIC sz_u64_t sz_bytesum_ice(sz_cptr_t text, sz_size_t length); + +/** @copydoc sz_hash */ +SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t text, sz_size_t length, sz_u64_t seed); + +/** @copydoc sz_generate */ +SZ_PUBLIC void sz_generate_ice(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); + +/** @copydoc sz_hash_state_init */ +SZ_PUBLIC void sz_hash_state_init_ice(sz_hash_state_t *state, sz_u64_t seed); + +/** @copydoc sz_hash_state_stream */ +SZ_PUBLIC void sz_hash_state_stream_ice(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length); + +/** @copydoc sz_hash_state_fold */ +SZ_PUBLIC sz_u64_t sz_hash_state_fold_ice(sz_hash_state_t const *state); + +/** @copydoc sz_bytesum */ +SZ_PUBLIC sz_u64_t 
sz_bytesum_neon(sz_cptr_t text, sz_size_t length); + +/** @copydoc sz_hash */ +SZ_PUBLIC sz_u64_t sz_hash_neon(sz_cptr_t text, sz_size_t length, sz_u64_t seed); + +/** @copydoc sz_generate */ +SZ_PUBLIC void sz_generate_neon(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); + +/** @copydoc sz_hash_state_init */ +SZ_PUBLIC void sz_hash_state_init_neon(sz_hash_state_t *state, sz_u64_t seed); + +/** @copydoc sz_hash_state_stream */ +SZ_PUBLIC void sz_hash_state_stream_neon(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length); + +/** @copydoc sz_hash_state_fold */ +SZ_PUBLIC sz_u64_t sz_hash_state_fold_neon(sz_hash_state_t const *state); #pragma endregion // Core API #pragma region Serial Implementation -SZ_PUBLIC sz_u64_t sz_checksum_serial(sz_cptr_t text, sz_size_t length) { - sz_u64_t checksum = 0; +SZ_PUBLIC sz_u64_t sz_bytesum_serial(sz_cptr_t text, sz_size_t length) { + sz_u64_t bytesum = 0; sz_u8_t const *text_u8 = (sz_u8_t const *)text; sz_u8_t const *text_end = text_u8 + length; - for (; text_u8 != text_end; ++text_u8) checksum += *text_u8; - return checksum; + for (; text_u8 != text_end; ++text_u8) bytesum += *text_u8; + return bytesum; } -/* - * One hardware-accelerated way of mixing hashes can be CRC, but it's only implemented for 32-bit values. - * Using a Boost-like mixer works very poorly in such case: - * - * hash_first ^ (hash_second + 0x517cc1b727220a95 + (hash_first << 6) + (hash_first >> 2)); - * - * Let's stick to the Fibonacci hash trick using the golden ratio. - * https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ - */ -#define _sz_hash_mix(first, second) ((first * 11400714819323198485ull) ^ (second * 11400714819323198485ull)) -#define _sz_shift_low(x) (x) -#define _sz_shift_high(x) ((x + 77ull) & 0xFFull) -#define _sz_prime_mod(x) (x % SZ_U64_MAX_PRIME) - -SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length) { - - sz_u64_t hash_low = 0; - sz_u64_t hash_high = 0; - sz_u8_t const *text = (sz_u8_t const *)start; - sz_u8_t const *text_end = text + length; - - switch (length) { - case 0: return 0; - - // Texts under 7 bytes long are definitely below the largest prime. 
- case 1: - hash_low = _sz_shift_low(text[0]); - hash_high = _sz_shift_high(text[0]); - break; - case 2: - hash_low = _sz_shift_low(text[0]) * 31ull + _sz_shift_low(text[1]); - hash_high = _sz_shift_high(text[0]) * 257ull + _sz_shift_high(text[1]); - break; - case 3: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull + // - _sz_shift_low(text[2]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull + // - _sz_shift_high(text[2]); - break; - case 4: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull + // - _sz_shift_low(text[3]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull + // - _sz_shift_high(text[3]); - break; - case 5: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull + // - _sz_shift_low(text[4]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull + // - _sz_shift_high(text[4]); - break; - case 6: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull + // - _sz_shift_low(text[5]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull + // - _sz_shift_high(text[5]); - break; - case 7: - hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[2]) * 31ull * 31ull * 31ull * 31ull + // - _sz_shift_low(text[3]) * 31ull * 31ull * 31ull + // - _sz_shift_low(text[4]) * 31ull * 31ull + // - _sz_shift_low(text[5]) * 31ull + // - _sz_shift_low(text[6]); - hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[2]) * 257ull * 257ull * 257ull * 257ull + // - _sz_shift_high(text[3]) * 257ull * 257ull * 257ull + // - _sz_shift_high(text[4]) * 257ull * 257ull + // - _sz_shift_high(text[5]) * 257ull + // - _sz_shift_high(text[6]); - break; - default: - // Unroll the first seven cycles: - hash_low = hash_low * 31ull + _sz_shift_low(text[0]); - hash_high = hash_high * 257ull + _sz_shift_high(text[0]); - hash_low = hash_low * 31ull + _sz_shift_low(text[1]); - hash_high = hash_high * 257ull + _sz_shift_high(text[1]); - hash_low = hash_low * 31ull + _sz_shift_low(text[2]); - hash_high = hash_high * 257ull + _sz_shift_high(text[2]); - hash_low = hash_low * 31ull + _sz_shift_low(text[3]); - hash_high = hash_high * 257ull + _sz_shift_high(text[3]); - hash_low = hash_low * 31ull + _sz_shift_low(text[4]); - hash_high = hash_high * 257ull + _sz_shift_high(text[4]); - hash_low = hash_low * 31ull + 
_sz_shift_low(text[5]);
-        hash_high = hash_high * 257ull + _sz_shift_high(text[5]);
-        hash_low = hash_low * 31ull + _sz_shift_low(text[6]);
-        hash_high = hash_high * 257ull + _sz_shift_high(text[6]);
-        text += 7;
-
-        // Iterate throw the rest with the modulus:
-        for (; text != text_end; ++text) {
-            hash_low = hash_low * 31ull + _sz_shift_low(text[0]);
-            hash_high = hash_high * 257ull + _sz_shift_high(text[0]);
-            // Wrap the hashes around:
-            hash_low = _sz_prime_mod(hash_low);
-            hash_high = _sz_prime_mod(hash_high);
-        }
-        break;
-    }
+SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length, sz_u64_t seed) {
+    sz_unused(start && length && seed);
+    return 0;
+}
+
+SZ_PUBLIC void sz_generate_serial(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) {
+    sz_unused(text && length && nonce);
+}
+
+SZ_PUBLIC void sz_hash_state_init_serial(sz_hash_state_t *state, sz_u64_t seed) { sz_unused(state && seed); }
 
-    return _sz_hash_mix(hash_low, hash_high);
+SZ_PUBLIC void sz_hash_state_stream_serial(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) {
+    sz_unused(state && text && length);
 }
 
-#undef _sz_shift_low
-#undef _sz_shift_high
-#undef _sz_hash_mix
-#undef _sz_prime_mod
+SZ_PUBLIC sz_u64_t sz_hash_state_fold_serial(sz_hash_state_t const *state) {
+    sz_unused(state);
+    return 0;
+}
 
 #pragma endregion // Serial Implementation
 
@@ -228,9 +279,9 @@ SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length) {
 #if SZ_USE_HASWELL
 #pragma GCC push_options
 #pragma GCC target("avx2")
-#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function)
+#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function)
 
-SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) {
+SZ_PUBLIC sz_u64_t sz_bytesum_haswell(sz_cptr_t text, sz_size_t length) {
     // The naive implementation of this function is very simple.
     // It assumes the CPU is great at handling unaligned "loads".
     //
@@ -240,7 +291,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) {
     int is_huge = length > 1ull * 1024ull * 1024ull;
 
     // When the buffer is small, there isn't much to innovate.
-    if (length <= 32) { return sz_checksum_serial(text, length); }
+    if (length <= 32) { return sz_bytesum_serial(text, length); }
     else if (!is_huge) {
         sz_u256_vec_t text_vec, sums_vec;
         sums_vec.ymm = _mm256_setzero_si256();
@@ -248,6 +299,9 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) {
             text_vec.ymm = _mm256_lddqu_si256((__m256i const *)text);
             sums_vec.ymm = _mm256_add_epi64(sums_vec.ymm, _mm256_sad_epu8(text_vec.ymm, _mm256_setzero_si256()));
         }
+        // We can also avoid the final serial loop by fetching 32 bytes from end, in reverse direction,
+        // and shifting the data within the register to zero-out the duplicate bytes.
+
         // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first.
         __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymm);
         __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymm, 1);
@@ -255,7 +309,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) {
         sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm);
         sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1);
         sz_u64_t result = low + high;
-        if (length) result += sz_checksum_serial(text, length);
+        if (length) result += sz_bytesum_serial(text, length);
         return result;
     }
     // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use.
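/*  An illustrative scalar model, not part of this patch, of what the AVX2 loop above accumulates:
 *  `_mm256_sad_epu8(v, zero)` sums the absolute differences against zero, i.e. the raw byte values,
 *  within every 8-byte group, so each 256-bit register yields four 64-bit partial sums that are
 *  folded together at the end. The name `bytesum_sad_model` is hypothetical and used only here.
 */
#include <stddef.h>
#include <stdint.h>

static uint64_t bytesum_sad_model(unsigned char const *text, size_t length) {
    uint64_t lanes[4] = {0, 0, 0, 0}; // mirrors the four 64-bit lanes of the YMM accumulator
    size_t progress = 0;
    for (; progress + 32 <= length; progress += 32)
        for (size_t lane = 0; lane != 4; ++lane)
            for (size_t byte = 0; byte != 8; ++byte)
                lanes[lane] += text[progress + lane * 8 + byte]; // what SAD-against-zero adds per lane
    uint64_t sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];
    for (; progress != length; ++progress) sum += text[progress]; // serial tail, like `sz_bytesum_serial`
    return sum;
}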
@@ -311,6 +365,24 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) { } } +SZ_PUBLIC sz_u64_t sz_hash_haswell(sz_cptr_t text, sz_size_t length, sz_u64_t seed) { + return sz_hash_serial(text, length, seed); +} + +SZ_PUBLIC void sz_generate_haswell(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { + sz_generate_serial(text, length, nonce); +} + +SZ_PUBLIC void sz_hash_state_init_haswell(sz_hash_state_t *state, sz_u64_t seed) { + sz_hash_state_init_serial(state, seed); +} + +SZ_PUBLIC void sz_hash_state_stream_haswell(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { + sz_hash_state_stream_serial(state, text, length); +} + +SZ_PUBLIC sz_u64_t sz_hash_state_fold_haswell(sz_hash_state_t const *state) { return sz_hash_state_fold_serial(state); } + #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_HASWELL @@ -327,7 +399,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) { #pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") #pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) -SZ_PUBLIC sz_u64_t sz_checksum_skylake(sz_cptr_t text, sz_size_t length) { +SZ_PUBLIC sz_u64_t sz_bytesum_skylake(sz_cptr_t text, sz_size_t length) { // The naive implementation of this function is very simple. // It assumes the CPU is great at handling unaligned "loads". // @@ -427,6 +499,24 @@ SZ_PUBLIC sz_u64_t sz_checksum_skylake(sz_cptr_t text, sz_size_t length) { } } +SZ_PUBLIC sz_u64_t sz_hash_skylake(sz_cptr_t text, sz_size_t length, sz_u64_t seed) { + return sz_hash_serial(text, length, seed); +} + +SZ_PUBLIC void sz_generate_skylake(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { + sz_generate_serial(text, length, nonce); +} + +SZ_PUBLIC void sz_hash_state_init_skylake(sz_hash_state_t *state, sz_u64_t seed) { + sz_hash_state_init_serial(state, seed); +} + +SZ_PUBLIC void sz_hash_state_stream_skylake(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { + sz_hash_state_stream_serial(state, text, length); +} + +SZ_PUBLIC sz_u64_t sz_hash_state_fold_skylake(sz_hash_state_t const *state) { return sz_hash_state_fold_serial(state); } + #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_SKYLAKE @@ -441,12 +531,13 @@ SZ_PUBLIC sz_u64_t sz_checksum_skylake(sz_cptr_t text, sz_size_t length) { #pragma region Ice Lake Implementation #if SZ_USE_ICE #pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "avx512vnni", "bmi", "bmi2") -#pragma clang attribute push( \ - __attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vnni,bmi,bmi2"))), \ +#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "avx512vnni", "bmi", "bmi2", \ + "aes", "vaes") +#pragma clang attribute push( \ + __attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vnni,bmi,bmi2,aes,vaes"))), \ apply_to = function) -SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) { +SZ_PUBLIC sz_u64_t sz_bytesum_ice(sz_cptr_t text, sz_size_t length) { // The naive implementation of this function is very simple. // It assumes the CPU is great at handling unaligned "loads". 
// @@ -572,6 +663,230 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) { } } +SZ_INTERNAL void _sz_hash_minimal_init_haswell(_sz_hash_minimal_t *state, sz_u64_t seed) { + __m128i seed_vec = _mm_set1_epi64x(seed); + __m128i pi0 = _mm_set_epi64x(0x13198a2e03707344ull, 0x243f6a8885a308d3ull); + __m128i pi1 = _mm_set_epi64x(0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull); + // XOR the user-supplied keys with the two "pi" constants + __m128i k1 = _mm_xor_si128(seed_vec, pi0); + __m128i k2 = _mm_xor_si128(seed_vec, pi1); + // Export the keys to the state + state->aes.xmm = k1; + state->sum.xmm = k2; + state->key.xmm = _mm_xor_si128(pi0, pi1); +} + +SZ_INTERNAL sz_u64_t _sz_hash_minimal_finalize_haswell(_sz_hash_minimal_t const *state) { + // Combine the sum and the AES block + __m128i mixed_registers = _mm_aesenc_si128(state->sum.xmm, state->aes.xmm); + // Make sure the "key" mixes enough with the state, + // as with less than 2 rounds - SMHasher fails + __m128i mixed_within_register = + _mm_aesdec_si128(_mm_aesdec_si128(mixed_registers, state->key.xmm), mixed_registers); + // Extract the low 64 bits + return _mm_cvtsi128_si64(mixed_within_register); +} + +SZ_INTERNAL void _sz_hash_minimal_update_haswell(_sz_hash_minimal_t *state, __m128i block) { + // This shuffle mask is identical to "aHash": + __m128i const shuffle_mask = _mm_set_epi8( // + 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // + 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02); + state->aes.xmm = _mm_aesdec_si128(state->aes.xmm, block); + state->sum.xmm = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmm, shuffle_mask), block); +} + +SZ_PUBLIC void sz_hash_state_init_ice(sz_hash_state_t *state, sz_u64_t seed) { + __m512i seed_vec = _mm512_set1_epi64(seed); + __m512i pi0 = _mm512_set_epi64( // + 0x13198a2e03707344ull, 0x243f6a8885a308d3ull, 0x13198a2e03707344ull, 0x243f6a8885a308d3ull, + 0x13198a2e03707344ull, 0x243f6a8885a308d3ull, 0x13198a2e03707344ull, 0x243f6a8885a308d3ull); + __m512i pi1 = _mm512_set_epi64( // + 0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull, 0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull, + 0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull, 0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull); + // XOR the user-supplied keys with the two "pi" constants + __m512i k1 = _mm512_xor_si512(seed_vec, pi0); + __m512i k2 = _mm512_xor_si512(seed_vec, pi1); + // Export the keys to the state + state->aes.zmm = k1; + state->sum.zmm = k2; + state->key.zmm = _mm512_xor_si512(pi0, pi1); + state->ins_length = 0; +} + +SZ_INTERNAL void _sz_hash_state_update_ice(sz_hash_state_t *state, __m512i block) { + // This shuffle mask is identical to "aHash": + __m512i const shuffle_mask = _mm512_set_epi8( // + 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // + 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02, // + 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // + 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02, // + 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // + 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02, // + 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // + 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02 // + ); + state->aes.zmm = _mm512_aesdec_epi128(state->aes.zmm, block); + state->sum.zmm = _mm512_add_epi64(_mm512_shuffle_epi8(state->sum.zmm, shuffle_mask), block); +} + +SZ_INTERNAL sz_u64_t _sz_hash_state_finalize_ice(sz_hash_state_t const *state) { + // Combine the sum and the AES block + __m128i mixed_registers0 = _mm_aesenc_si128(state->sum.xmms[0], state->aes.xmms[0]); + __m128i mixed_registers1 = 
_mm_aesenc_si128(state->sum.xmms[1], state->aes.xmms[1]); + __m128i mixed_registers2 = _mm_aesenc_si128(state->sum.xmms[2], state->aes.xmms[2]); + __m128i mixed_registers3 = _mm_aesenc_si128(state->sum.xmms[3], state->aes.xmms[3]); + // Combine the mixed registers + __m128i mixed_registers01 = _mm_aesenc_si128(mixed_registers0, mixed_registers1); + __m128i mixed_registers23 = _mm_aesenc_si128(mixed_registers2, mixed_registers3); + __m128i mixed_registers = _mm_aesenc_si128(mixed_registers01, mixed_registers23); + // Make sure the "key" mixes enough with the state, + // as with less than 2 rounds - SMHasher fails + __m128i mixed_within_register = _mm_aesdec_si128( // + _mm_aesdec_si128(mixed_registers, state->key.xmms[0]), mixed_registers); + // Extract the low 64 bits + return _mm_cvtsi128_si64(mixed_within_register); +} + +SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed) { + + if (length <= 16) { + // Initialize the AES block with a given seed and update with the input length + _sz_hash_minimal_t state; + _sz_hash_minimal_init_haswell(&state, seed); + state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); + // Load the data and update the state + sz_u128_vec_t data_vec; + data_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length), start); + _sz_hash_minimal_update_haswell(&state, data_vec.xmm); + return _sz_hash_minimal_finalize_haswell(&state); + } + else if (length <= 32) { + // Initialize the AES block with a given seed and update with the input length + _sz_hash_minimal_t state; + _sz_hash_minimal_init_haswell(&state, seed); + state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); + // Load the data and update the state + sz_u128_vec_t data0_vec, data1_vec; + data0_vec.xmm = _mm_loadu_epi8(start); + data1_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length - 16), start + 16); + _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); + return _sz_hash_minimal_finalize_haswell(&state); + } + else if (length <= 48) { + // Initialize the AES block with a given seed and update with the input length + _sz_hash_minimal_t state; + _sz_hash_minimal_init_haswell(&state, seed); + state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); + // Load the data and update the state + sz_u128_vec_t data0_vec, data1_vec, data2_vec; + data0_vec.xmm = _mm_loadu_epi8(start); + data1_vec.xmm = _mm_loadu_epi8(start + 16); + data2_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length - 32), start + 32); + _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data2_vec.xmm); + return _sz_hash_minimal_finalize_haswell(&state); + } + else if (length <= 64) { + // Initialize the AES block with a given seed and update with the input length + _sz_hash_minimal_t state; + _sz_hash_minimal_init_haswell(&state, seed); + state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); + // Load the data and update the state + sz_u128_vec_t data0_vec, data1_vec, data2_vec, data3_vec; + data0_vec.xmm = _mm_loadu_epi8(start); + data1_vec.xmm = _mm_loadu_epi8(start + 16); + data2_vec.xmm = _mm_loadu_epi8(start + 32); + data3_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length - 48), start + 48); + _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); + _sz_hash_minimal_update_haswell(&state, 
data2_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data3_vec.xmm); + return _sz_hash_minimal_finalize_haswell(&state); + } + else { + // Use a larger state to handle the main loop and add different offsets + // to different lanes of the register + sz_hash_state_t state; + sz_hash_state_init_ice(&state, seed); + state.aes.zmm = _mm512_add_epi64( // + state.aes.zmm, // + _mm512_set_epi64(0, length, 16, length, 32, length, 48, length)); + + for (; state.ins_length + 64 <= length; state.ins_length += 64) { + state.ins.zmm = _mm512_loadu_epi8(start + state.ins_length); + _sz_hash_state_update_ice(&state, state.ins.zmm); + } + if (state.ins_length < length) { + state.ins.zmm = _mm512_maskz_loadu_epi8( // + _sz_u64_mask_until(length - state.ins_length), start + state.ins_length); + _sz_hash_state_update_ice(&state, state.ins.zmm); + } + return _sz_hash_state_finalize_ice(&state); + } +} + +SZ_PUBLIC void sz_generate_ice(sz_ptr_t output, sz_size_t length, sz_u64_t nonce) { + // We can use `_mm512_broadcast_i32x4` and the `vbroadcasti32x4` instruction, but its latency is freaking 8 cycles. + // The `_mm512_shuffle_i32x4` and the `vshufi32x4` instruction has a latency of 3 cycles, somewhat better. + // The `_mm512_permutex_epi64` and the `vpermq` instruction also has a latency of 3 cycles. + // So we want to avoid that, if possible. + __m128i nonce_vec = _mm_set1_epi64x(nonce); + __m128i key128 = _mm_xor_si128(nonce_vec, _mm_set_epi64x(0x13198a2e03707344ull, 0x243f6a8885a308d3ull)); + if (length <= 16) { + __mmask16 mask = _sz_u16_mask_until(length); + __m128i input = _mm_set1_epi64x(nonce); + __m128i generated = _mm_aesenc_si128(input, key128); + _mm_mask_storeu_epi8((void *)output, mask, generated); + } + // Assuming the YMM register contains two 128-bit blocks, the input to the generator + // will be more complex, containing the sum of the nonce and the block number. + else if (length <= 32) { + __mmask32 mask = _sz_u32_mask_until(length); + __m256i input = _mm256_set_epi64x(nonce + 1, nonce + 1, nonce, nonce); + __m256i key256 = + _mm256_permute2x128_si256(_mm256_castsi128_si256(key128), _mm256_castsi128_si256(key128), 0x00); + __m256i generated = _mm256_aesenc_epi128(input, key256); + _mm256_mask_storeu_epi8((void *)output, mask, generated); + } + // The last special case we handle outside of the primary loop is for buffers up to 64 bytes long. + else if (length <= 64) { + __mmask64 mask = _sz_u64_mask_until(length); + __m512i input = _mm512_set_epi64( // + nonce + 3, nonce + 3, nonce + 2, nonce + 2, // + nonce + 1, nonce + 1, nonce, nonce); + __m512i key512 = _mm512_permutex_epi64(_mm512_castsi128_si512(key128), 0x00); + __m512i generated = _mm512_aesenc_epi128(input, key512); + _mm512_mask_storeu_epi8((void *)output, mask, generated); + } + // The final part of the function is the primary loop, which processes the buffer in 64-byte chunks. + else { + __m512i increment = _mm512_set1_epi64(4); + __m512i input = _mm512_set_epi64( // + nonce + 3, nonce + 3, nonce + 2, nonce + 2, // + nonce + 1, nonce + 1, nonce, nonce); + __m512i key512 = _mm512_permutex_epi64(_mm512_castsi128_si512(key128), 0x00); + sz_size_t i = 0; + for (; i + 64 <= length; i += 64) { + __m512i generated = _mm512_aesenc_epi128(input, key512); + _mm512_storeu_epi8((void *)(output + i), generated); + input = _mm512_add_epi64(input, increment); + } + // Handle the tail of the buffer. 
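+        // The counter in `input` has already advanced past the last full block, so the masked
+        // store below simply emits the next `length - i` bytes of the same AES-CTR stream.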
+ __mmask64 mask = _sz_u64_mask_until(length - i); + __m512i generated = _mm512_aesenc_epi128(input, key512); + _mm512_mask_storeu_epi8((void *)(output + i), mask, generated); + } +} + +SZ_PUBLIC void sz_hash_state_stream_ice(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { + sz_hash_state_stream_serial(state, text, length); +} + +SZ_PUBLIC sz_u64_t sz_hash_state_fold_ice(sz_hash_state_t const *state) { return sz_hash_state_fold_serial(state); } + #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_ICE @@ -586,7 +901,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) { #pragma GCC target("arch=armv8.2-a+simd") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) -SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) { +SZ_PUBLIC sz_u64_t sz_bytesum_neon(sz_cptr_t text, sz_size_t length) { uint64x2_t sum_vec = vdupq_n_u64(0); // Process 16 bytes (128 bits) at a time @@ -600,10 +915,20 @@ SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) { // Final reduction of `sum_vec` to a single scalar sz_u64_t sum = vgetq_lane_u64(sum_vec, 0) + vgetq_lane_u64(sum_vec, 1); - if (length) sum += sz_checksum_serial(text, length); + if (length) sum += sz_bytesum_serial(text, length); return sum; } +SZ_PUBLIC void sz_hash_state_init_neon(sz_hash_state_t *state, sz_u64_t seed) { + sz_hash_state_init_serial(state, seed); +} + +SZ_PUBLIC void sz_hash_state_stream_neon(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { + sz_hash_state_stream_serial(state, text, length); +} + +SZ_PUBLIC sz_u64_t sz_hash_state_fold_neon(sz_hash_state_t const *state) { return sz_hash_state_fold_serial(state); } + #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_NEON @@ -629,23 +954,88 @@ SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) { #pragma region Compile Time Dispatching #if !SZ_DYNAMIC_DISPATCH -SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) { +SZ_DYNAMIC sz_u64_t sz_bytesum(sz_cptr_t text, sz_size_t length) { +#if SZ_USE_ICE + return sz_bytesum_ice(text, length); +#elif SZ_USE_SKYLAKE + return sz_bytesum_skylake(text, length); +#elif SZ_USE_HASWELL + return sz_bytesum_haswell(text, length); +#elif SZ_USE_NEON + return sz_bytesum_neon(text, length); +#else + return sz_bytesum_serial(text, length); +#endif +} + +SZ_DYNAMIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length, sz_u64_t seed) { +#if SZ_USE_ICE + return sz_hash_ice(text, length, seed); +#elif SZ_USE_SKYLAKE + return sz_hash_skylake(text, length, seed); +#elif SZ_USE_HASWELL + return sz_hash_haswell(text, length, seed); +#elif SZ_USE_NEON + return sz_hash_neon(text, length, seed); +#else + return sz_hash_serial(text, length, seed); +#endif +} + +SZ_DYNAMIC void sz_generate(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { #if SZ_USE_ICE - return sz_checksum_ice(text, length); + sz_generate_ice(text, length, nonce); #elif SZ_USE_SKYLAKE - return sz_checksum_skylake(text, length); + sz_generate_skylake(text, length, nonce); #elif SZ_USE_HASWELL - return sz_checksum_haswell(text, length); + sz_generate_haswell(text, length, nonce); #elif SZ_USE_NEON - return sz_checksum_neon(text, length); + sz_generate_neon(text, length, nonce); #else - return sz_checksum_serial(text, length); + sz_generate_serial(text, length, nonce); #endif } -SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length, - sz_random_generator_t 
generator, void *generator_user_data) { - sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data); +SZ_DYNAMIC void sz_hash_state_init(sz_hash_state_t *state, sz_u64_t seed) { +#if SZ_USE_ICE + sz_hash_state_init_ice(state, seed); +#elif SZ_USE_SKYLAKE + sz_hash_state_init_skylake(state, seed); +#elif SZ_USE_HASWELL + sz_hash_state_init_haswell(state, seed); +#elif SZ_USE_NEON + sz_hash_state_init_neon(state, seed); +#else + sz_hash_state_init_serial(state, seed); +#endif +} + +SZ_DYNAMIC void sz_hash_state_stream(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { +#if SZ_USE_ICE + sz_hash_state_stream_ice(state, text, length); +#elif SZ_USE_SKYLAKE + sz_hash_state_stream_skylake(state, text, length); +#elif SZ_USE_HASWELL + sz_hash_state_stream_haswell(state, text, length); +#elif SZ_USE_NEON + sz_hash_state_stream_neon(state, text, length); +#else + sz_hash_state_stream_serial(state, text, length); +#endif +} + +SZ_DYNAMIC sz_u64_t sz_hash_state_fold(sz_hash_state_t const *state) { +#if SZ_USE_ICE + return sz_hash_state_fold_ice(state); +#elif SZ_USE_SKYLAKE + return sz_hash_state_fold_skylake(state); +#elif SZ_USE_HASWELL + return sz_hash_state_fold_haswell(state); +#elif SZ_USE_NEON + return sz_hash_state_fold_neon(state); +#else + return sz_hash_state_fold_serial(state); +#endif } #endif // !SZ_DYNAMIC_DISPATCH diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index 977d29e1..4e1a6377 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -31,6 +31,7 @@ #include "types.h" #include "compare.h" // `sz_compare` +#include "memory.h" // `sz_copy` #ifdef __cplusplus extern "C" { @@ -414,19 +415,19 @@ SZ_INTERNAL void _sz_sequence_argsort_serial_export_next_pgrams( * @brief Picks the "pivot" value for the QuickSort algorithm's partitioning step using Robert Sedgewick's method, * the median of three elements - the first, the middle, and the last element of the given range. 
*/ -SZ_INTERNAL sz_pgram_t _sz_sequence_partitioning_pivot(sz_pgram_t const *pgrams, sz_size_t count) { +SZ_INTERNAL sz_pgram_t const *_sz_sequence_partitioning_pivot(sz_pgram_t const *pgrams, sz_size_t count) { sz_size_t const middle_offset = count / 2; - sz_pgram_t const first_pgram = pgrams[0]; - sz_pgram_t const middle_pgram = pgrams[middle_offset]; - sz_pgram_t const last_pgram = pgrams[count - 1]; - if (first_pgram < middle_pgram) { - if (middle_pgram < last_pgram) { return middle_pgram; } - else if (first_pgram < last_pgram) { return last_pgram; } + sz_pgram_t const *first_pgram = &pgrams[0]; + sz_pgram_t const *middle_pgram = &pgrams[middle_offset]; + sz_pgram_t const *last_pgram = &pgrams[count - 1]; + if (*first_pgram < *middle_pgram) { + if (*middle_pgram < *last_pgram) { return middle_pgram; } + else if (*first_pgram < *last_pgram) { return last_pgram; } else { return first_pgram; } } else { - if (first_pgram < last_pgram) { return first_pgram; } - else if (middle_pgram < last_pgram) { return last_pgram; } + if (*first_pgram < *last_pgram) { return first_pgram; } + else if (*middle_pgram < *last_pgram) { return last_pgram; } else { return middle_pgram; } } } @@ -440,7 +441,7 @@ SZ_INTERNAL sz_pgram_t _sz_sequence_partitioning_pivot(sz_pgram_t const *pgrams, * * @see https://en.wikipedia.org/wiki/Dutch_national_flag_problem */ -SZ_PUBLIC void _sz_sequence_argsort_serial_3way_partition( // +SZ_INTERNAL void _sz_sequence_argsort_serial_3way_partition( // sz_pgram_t *const global_pgrams, sz_sorted_idx_t *const global_order, // sz_size_t const start_in_sequence, sz_size_t const end_in_sequence, // sz_size_t *first_pivot_offset, sz_size_t *last_pivot_offset) { @@ -459,7 +460,7 @@ SZ_PUBLIC void _sz_sequence_argsort_serial_3way_partition( // } // Chose the pivot offset with Sedgewick's method. - sz_pgram_t const pivot_pgram = _sz_sequence_partitioning_pivot(global_pgrams + start_in_sequence, count); + sz_pgram_t const pivot_pgram = *_sz_sequence_partitioning_pivot(global_pgrams + start_in_sequence, count); // Loop through the collection and move the elements around the pivot with the 3-way partitioning. sz_size_t partitioning_progress = start_in_sequence; // Current index. @@ -492,7 +493,7 @@ SZ_PUBLIC void _sz_sequence_argsort_serial_3way_partition( // * @brief Recursive Quick-Sort implementation backing both the `sz_sequence_argsort` and `sz_pgrams_sort`, * and using the `_sz_sequence_argsort_serial_3way_partition` under the hood. */ -SZ_INTERNAL void _sz_sequence_argsort_serial_recursively( // +SZ_PUBLIC void _sz_sequence_argsort_serial_recursively( // sz_pgram_t *const global_pgrams, sz_sorted_idx_t *const global_order, // sz_size_t const start_in_sequence, sz_size_t const end_in_sequence) { @@ -517,7 +518,7 @@ SZ_INTERNAL void _sz_sequence_argsort_serial_recursively( // * It combines `_sz_sequence_argsort_serial_export_next_pgrams` and `_sz_sequence_argsort_serial_recursively`, * recursively diving into the identical pgrams. */ -SZ_INTERNAL void _sz_sequence_argsort_serial_next_pgrams( // +SZ_PUBLIC void _sz_sequence_argsort_serial_next_pgrams( // sz_sequence_t const *const sequence, // sz_pgram_t *const global_pgrams, sz_sorted_idx_t *const global_order, // sz_size_t const start_in_sequence, sz_size_t const end_in_sequence, // @@ -735,43 +736,177 @@ SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t c #pragma endregion // Serial MergeSort Implementation +/* AVX512 implementation of the string search algorithms for Ice Lake and newer CPUs. 
+ * Includes extensions: + * - 2017 Skylake: F, CD, ER, PF, VL, DQ, BW, + * - 2018 CannonLake: IFMA, VBMI, + * - 2019 Ice Lake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES. + * + * We are going to use VBMI2 for `_mm256_maskz_compress_epi8`. + */ #pragma region Ice Lake Implementation +#if SZ_USE_ICE +#pragma GCC push_options +#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "avx512vbmi2", "bmi", "bmi2") +#pragma clang attribute push( \ + __attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vbmi2,bmi,bmi2"))), \ + apply_to = function) -SZ_PUBLIC void _sz_sequence_argsort_ice_recursively( // - sz_sequence_t const *const collection, // - sz_pgram_t *const global_pgrams, sz_size_t *const global_order, // - sz_size_t const start_in_sequence, sz_size_t const end_in_sequence, // - sz_size_t const start_character) { +/** + * @brief The most important part of the QuickSort algorithm partitioning the elements around the pivot. + * Unlike the serial algorithm, uses compressed stores to filter and move the elements around the pivot. + * Assuming the extreme cost of shuffling between 2 ZMM registers based on 2 different masks, we use + * extra memory to store the elements smaller and greater than the pivot somewhere else. + */ +SZ_INTERNAL void _sz_sequence_argsort_ice_2way_partition( // + sz_pgram_t *const initial_pgrams, sz_sorted_idx_t *const initial_order, // + sz_pgram_t *const partitioned_pgrams, sz_sorted_idx_t *const partitioned_order, // + sz_size_t const start_in_sequence, sz_size_t const end_in_sequence, // + sz_size_t *const first_pivot_offset, sz_size_t *const last_pivot_offset) { - // Prepare the new range of windows - _sz_sequence_argsort_serial_export_next_pgrams(collection, global_pgrams, global_order, start_in_sequence, - end_in_sequence, start_character); + sz_size_t const count = end_in_sequence - start_in_sequence; + sz_size_t const pgrams_per_register = sizeof(sz_u512_vec_t) / sizeof(sz_pgram_t); - // We can implement a form of a Radix sort here, that will count the number of elements with - // a certain bit set. The naive approach may require too many loops over data. A more "vectorized" - // approach would be to maintain a histogram for several bits at once. For 4 bits we will - // need 2^4 = 16 counters. - sz_size_t histogram[16] = {0}; - for (sz_size_t byte_in_window = 0; byte_in_window != sizeof(sz_pgram_t); ++byte_in_window) { - // First sort based on the low nibble of each byte. - for (sz_size_t i = start_in_sequence; i < end_in_sequence; ++i) { - sz_size_t const byte = (global_pgrams[i] >> (byte_in_window * 8)) & 0xFF; - ++histogram[byte]; - } - sz_size_t offset = start_in_sequence; - for (sz_size_t i = 0; i != 16; ++i) { - sz_size_t const count = histogram[i]; - histogram[i] = offset; - offset += count; + // Choose the pivot offset with Sedgewick's method. + sz_pgram_t const *pivot_pgram_ptr = _sz_sequence_partitioning_pivot(initial_order + start_in_sequence, count); + sz_pgram_t const pivot_pgram = *pivot_pgram_ptr; + sz_u512_vec_t pivot_vec; + pivot_vec.zmm = _mm512_set1_epi64(pivot_pgram); + + // Reading data is always cheaper than writing, so we can further minimize the writes, if + // we know exactly, how many elements are smaller or greater than the pivot. 
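+    // For example, in a 20-element range with 11 entries below the pivot and 6 above it,
+    // the "equal" run will occupy offsets [11, 14) and the "greater" run offsets [14, 20).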
+ sz_size_t count_smaller = 0, count_greater = 0; + sz_size_t const tail_count = count & 7u; + __mmask8 const tail_mask = _sz_u8_mask_until(tail_count); + + sz_u512_vec_t pgrams_vec, order_vec; + for (sz_size_t i = start_in_sequence; i < end_in_sequence; i += pgrams_per_register) { + pgrams_vec.zmm = // + i + pgrams_per_register <= end_in_sequence // + ? _mm512_loadu_si512(initial_pgrams + i) + : _mm512_maskz_loadu_epi64(tail_mask, initial_pgrams + i); + count_smaller += sz_u32_popcount(_mm512_cmplt_epu64_mask(pgrams_vec.zmm, pivot_vec.zmm)); + count_greater += sz_u32_popcount(_mm512_cmpgt_epu64_mask(pgrams_vec.zmm, pivot_vec.zmm)); + } + + // Now all we need to do is to loop through the collection and export them into the temporary buffer + // in 3 separate segments - smaller, equal, and greater than the pivot. + sz_size_t const count_equal = count - count_smaller - count_greater; + sz_size_t smaller_offset = start_in_sequence; + sz_size_t equal_offset = start_in_sequence + count_smaller; + sz_size_t greater_offset = start_in_sequence + count_smaller + count_equal; + + // The naive algorithm - unzip the elements into 3 separate buffers. + for (sz_size_t i = start_in_sequence; i < end_in_sequence; i += pgrams_per_register) { + if (i + pgrams_per_register <= end_in_sequence) { + pgrams_vec.zmm = _mm512_loadu_si512(initial_pgrams + i); + order_vec.zmm = _mm512_loadu_si512(initial_order + i); } - for (sz_size_t i = start_in_sequence; i < end_in_sequence; ++i) { - sz_size_t const byte = (global_pgrams[i] >> (byte_in_window * 8)) & 0xFF; - global_order[histogram[byte]] = i; - ++histogram[byte]; + else { + pgrams_vec.zmm = _mm512_maskz_loadu_epi64(tail_count, initial_pgrams + i); + order_vec.zmm = _mm512_maskz_loadu_epi64(tail_count, initial_order + i); } + pgrams_vec.zmm = _mm512_loadu_si512(initial_pgrams + i); + order_vec.zmm = _mm512_loadu_si512(initial_order + i); + __mmask8 const smaller_mask = _mm512_cmplt_epu64_mask(pgrams_vec.zmm, pivot_vec.zmm); + __mmask8 const equal_mask = _mm512_cmpeq_epu64_mask(pgrams_vec.zmm, pivot_vec.zmm); + __mmask8 const greater_mask = _mm512_cmpgt_epu64_mask(pgrams_vec.zmm, pivot_vec.zmm); + + // Compress the elements into the temporary buffer. + _mm512_mask_compressstoreu_epi64(partitioned_pgrams + smaller_offset, smaller_mask, pgrams_vec.zmm); + _mm512_mask_compressstoreu_epi64(partitioned_order + smaller_offset, smaller_mask, order_vec.zmm); + smaller_offset += _mm_popcnt_u32(smaller_mask); + + _mm512_mask_compressstoreu_epi64(partitioned_pgrams + equal_offset, equal_mask, pgrams_vec.zmm); + _mm512_mask_compressstoreu_epi64(partitioned_order + equal_offset, equal_mask, order_vec.zmm); + equal_offset += _mm_popcnt_u32(equal_mask); + + _mm512_mask_compressstoreu_epi64(partitioned_pgrams + greater_offset, greater_mask, pgrams_vec.zmm); + _mm512_mask_compressstoreu_epi64(partitioned_order + greater_offset, greater_mask, order_vec.zmm); + greater_offset += _mm_popcnt_u32(greater_mask); + } + + // Copy back. 
+ sz_copy((sz_ptr_t)(initial_pgrams), (sz_cptr_t)(partitioned_pgrams), count_smaller * sizeof(sz_pgram_t)); + sz_copy((sz_ptr_t)(initial_order), (sz_cptr_t)(partitioned_order), count_smaller * sizeof(sz_pgram_t)); + sz_copy((sz_ptr_t)(initial_pgrams + count_smaller), // + (sz_cptr_t)(partitioned_pgrams + count_smaller), // + count_equal * sizeof(sz_pgram_t)); + sz_copy((sz_ptr_t)(initial_order + count_smaller), // + (sz_cptr_t)(partitioned_order + count_smaller), // + count_equal * sizeof(sz_pgram_t)); + sz_copy((sz_ptr_t)(initial_pgrams + count_smaller + count_equal), // + (sz_cptr_t)(partitioned_pgrams + count_smaller + count_equal), // + count_greater); + sz_copy((sz_ptr_t)(initial_order + count_smaller + count_equal), // + (sz_cptr_t)(partitioned_order + count_smaller + count_equal), // + count_greater); + + // Return the offsets of the equal elements. + *first_pivot_offset = count_smaller; + *last_pivot_offset = count_smaller + count_equal; +} + +/** + * @brief Recursive Quick-Sort implementation backing both the `sz_sequence_argsort_ice` and `sz_pgrams_sort_ice`, + * and using the `_sz_sequence_argsort_ice_2way_partition` under the hood. + */ +SZ_INTERNAL void _sz_sequence_argsort_ice_recursively( // + sz_pgram_t *initial_pgrams, sz_sorted_idx_t *initial_order, // + sz_pgram_t *temporary_pgrams, sz_sorted_idx_t *temporary_order, // + sz_size_t const start_in_sequence, sz_size_t const end_in_sequence) { + + // On very small inputs, when we don't even have enough input for a single ZMM register, + // use simple insertion sort without any extra memory. + sz_size_t const count = end_in_sequence - start_in_sequence; + sz_size_t const pgrams_per_register = sizeof(sz_u512_vec_t) / sizeof(sz_pgram_t); + if (count <= pgrams_per_register) { + sz_pgrams_sort_stable_with_insertion( // + initial_pgrams + start_in_sequence, count, initial_order + start_in_sequence); + return; } + + // Partition the collection around some pivot + sz_size_t first_pivot_index, last_pivot_index; + _sz_sequence_argsort_ice_2way_partition( // + initial_pgrams, initial_order, temporary_pgrams, temporary_order, // + start_in_sequence, end_in_sequence, // + &first_pivot_index, &last_pivot_index); + + // Recursively sort the left and right partitions, tracking where the output goes + if (start_in_sequence < first_pivot_index) + _sz_sequence_argsort_ice_recursively( // + initial_pgrams, initial_order, temporary_pgrams, temporary_order, // + start_in_sequence, first_pivot_index); + if (last_pivot_index + 1 < end_in_sequence) + _sz_sequence_argsort_ice_recursively( // + initial_pgrams, initial_order, temporary_pgrams, temporary_order, // + last_pivot_index + 1, end_in_sequence); +} + +SZ_PUBLIC sz_bool_t sz_pgrams_sort_ice(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { + + // First, initialize the `order` with `std::iota`-like behavior. + for (sz_size_t i = 0; i != count; ++i) order[i] = i; + + // Allocate memory for partitioning the elements around the pivot. + sz_size_t memory_usage = sizeof(sz_pgram_t) * count + sizeof(sz_sorted_idx_t) * count; + sz_pgram_t *temporary_pgrams = (sz_pgram_t *)alloc->allocate(memory_usage, alloc); + sz_sorted_idx_t *temporary_order = (sz_sorted_idx_t *)(temporary_pgrams + count); + if (!temporary_pgrams) return sz_false_k; + + // Reuse the string sorting algorithm for sorting the "pgrams". 
+ _sz_sequence_argsort_ice_recursively(pgrams, order, temporary_pgrams, temporary_order, 0, count); + + // Deallocate the temporary memory used for partitioning. + alloc->free(temporary_pgrams, memory_usage, alloc); + return sz_true_k; } +#pragma clang attribute pop +#pragma GCC pop_options +#endif // SZ_USE_ICE #pragma endregion // Ice Lake Implementation /* Pick the right implementation for the string search algorithms. diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index 12f65265..3f5466b6 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -1926,7 +1926,9 @@ class basic_string_slice { #pragma endregion /** @brief Hashes the string, equivalent to `std::hash{}(str)`. */ - size_type hash() const noexcept { return static_cast(sz_hash(start_, length_)); } + size_type hash(std::uint64_t seed = 42) const noexcept { + return static_cast(sz_hash(start_, length_, static_cast(seed))); + } /** @brief Aggregates the values of individual bytes of a string. */ size_type bytesum() const noexcept { return static_cast(sz_bytesum(start_, length_)); } @@ -2795,7 +2797,7 @@ class basic_string { } /** - * @brief Erases ( @b in-place ) a range of characters defined with signed offsets. + * @brief Erases @b (in-place) a range of characters defined with signed offsets. * @return Number of characters removed. */ size_type try_erase(difference_type signed_start_offset = 0, difference_type signed_end_offset = npos) noexcept { @@ -2807,7 +2809,7 @@ class basic_string { } /** - * @brief Inserts ( @b in-place ) a range of characters at a given signed offset. + * @brief Inserts @b (in-place) a range of characters at a given signed offset. * @return `true` if the insertion was successful, `false` otherwise. */ bool try_insert(difference_type signed_offset, string_view string) noexcept { @@ -2823,7 +2825,7 @@ class basic_string { } /** - * @brief Replaces ( @b in-place ) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @return `true` if the replacement was successful, `false` otherwise. */ bool try_replace(difference_type signed_start_offset, difference_type signed_end_offset, @@ -2874,7 +2876,7 @@ class basic_string { } /** - * @brief Inserts ( @b in-place ) a ::character multiple times at the given offset. + * @brief Inserts @b (in-place) a ::character multiple times at the given offset. * @throw `std::out_of_range` if `offset > size()`. * @throw `std::length_error` if the string is too long. * @throw `std::bad_alloc` if the allocation fails. @@ -2890,7 +2892,7 @@ class basic_string { } /** - * @brief Inserts ( @b in-place ) a range of characters at the given offset. + * @brief Inserts @b (in-place) a range of characters at the given offset. * @throw `std::out_of_range` if `offset > size()`. * @throw `std::length_error` if the string is too long. * @throw `std::bad_alloc` if the allocation fails. @@ -2907,7 +2909,7 @@ class basic_string { } /** - * @brief Inserts ( @b in-place ) a range of characters at the given offset. + * @brief Inserts @b (in-place) a range of characters at the given offset. * @throw `std::out_of_range` if `offset > size()`. * @throw `std::length_error` if the string is too long. * @throw `std::bad_alloc` if the allocation fails. @@ -2917,7 +2919,7 @@ class basic_string { } /** - * @brief Inserts ( @b in-place ) a slice of another string at the given offset. + * @brief Inserts @b (in-place) a slice of another string at the given offset. 
* @throw `std::out_of_range` if `offset > size()` or `other_index > other.size()`. * @throw `std::length_error` if the string is too long. * @throw `std::bad_alloc` if the allocation fails. @@ -2928,7 +2930,7 @@ class basic_string { } /** - * @brief Inserts ( @b in-place ) one ::character at the given iterator position. + * @brief Inserts @b (in-place) one ::character at the given iterator position. * @throw `std::out_of_range` if `pos > size()` or `other_index > other.size()`. * @throw `std::length_error` if the string is too long. * @throw `std::bad_alloc` if the allocation fails. @@ -2940,7 +2942,7 @@ class basic_string { } /** - * @brief Inserts ( @b in-place ) a ::character multiple times at the given iterator position. + * @brief Inserts @b (in-place) a ::character multiple times at the given iterator position. * @throw `std::out_of_range` if `pos > size()` or `other_index > other.size()`. * @throw `std::length_error` if the string is too long. * @throw `std::bad_alloc` if the allocation fails. @@ -2952,7 +2954,7 @@ class basic_string { } /** - * @brief Inserts ( @b in-place ) a range at the given iterator position. + * @brief Inserts @b (in-place) a range at the given iterator position. * @throw `std::out_of_range` if `pos > size()` or `other_index > other.size()`. * @throw `std::length_error` if the string is too long. * @throw `std::bad_alloc` if the allocation fails. @@ -2975,7 +2977,7 @@ class basic_string { } /** - * @brief Inserts ( @b in-place ) an initializer list of characters. + * @brief Inserts @b (in-place) an initializer list of characters. * @throw `std::out_of_range` if `pos > size()` or `other_index > other.size()`. * @throw `std::length_error` if the string is too long. * @throw `std::bad_alloc` if the allocation fails. @@ -2985,7 +2987,7 @@ class basic_string { } /** - * @brief Erases ( @b in-place ) the given range of characters. + * @brief Erases @b (in-place) the given range of characters. * @throws `std::out_of_range` if `pos > size()`. * @see `try_erase_slice` for a cleaner exception-less alternative. */ @@ -2997,7 +2999,7 @@ class basic_string { } /** - * @brief Erases ( @b in-place ) the given range of characters. + * @brief Erases @b (in-place) the given range of characters. * @return Iterator pointing following the erased character, or end() if no such character exists. */ iterator erase(const_iterator first, const_iterator last) noexcept { @@ -3008,13 +3010,13 @@ class basic_string { } /** - * @brief Erases ( @b in-place ) the one character at a given postion. + * @brief Erases @b (in-place) the one character at a given postion. * @return Iterator pointing following the erased character, or end() if no such character exists. */ iterator erase(const_iterator pos) noexcept { return erase(pos, pos + 1); } /** - * @brief Replaces ( @b in-place ) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. * @see `try_replace` for a cleaner exception-less alternative. @@ -3028,7 +3030,7 @@ class basic_string { } /** - * @brief Replaces ( @b in-place ) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. * @see `try_replace` for a cleaner exception-less alternative. 
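Since the Doxygen comments above repeatedly point at the exception-free `try_*` editing API, a short usage sketch follows. It assumes the conventional `sz::string` alias for `basic_string` and literal-constructible strings, neither of which is shown in this patch; treat it as an illustration rather than the library's documented example.

#include <stringzilla/stringzilla.hpp>
#include <cassert>
#include <cstddef>

int main() {
    sz::string text = "hello, world";            // the `sz::string` alias is an assumption here
    if (!text.try_insert(5, " there")) return 1; // "hello there, world"; reports failure instead of throwing
    std::size_t removed = text.try_erase(5, 11); // signed offsets; returns the number of erased characters
    assert(removed == 6);                        // back to "hello, world"
    return 0;
}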
@@ -3038,7 +3040,7 @@ class basic_string { } /** - * @brief Replaces ( @b in-place ) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @throws `std::out_of_range` if `pos > size()` or `pos2 > str.size()`. * @throws `std::length_error` if the string is too long. * @see `try_replace` for a cleaner exception-less alternative. @@ -3049,7 +3051,7 @@ class basic_string { } /** - * @brief Replaces ( @b in-place ) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. * @see `try_replace` for a cleaner exception-less alternative. @@ -3059,7 +3061,7 @@ class basic_string { } /** - * @brief Replaces ( @b in-place ) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. * @see `try_replace` for a cleaner exception-less alternative. @@ -3070,7 +3072,7 @@ class basic_string { } /** - * @brief Replaces ( @b in-place ) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. * @see `try_replace` for a cleaner exception-less alternative. @@ -3080,7 +3082,7 @@ class basic_string { } /** - * @brief Replaces ( @b in-place ) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. * @see `try_replace` for a cleaner exception-less alternative. @@ -3090,7 +3092,7 @@ class basic_string { } /** - * @brief Replaces ( @b in-place ) a range of characters with a repetition of given characters. + * @brief Replaces @b (in-place) a range of characters with a repetition of given characters. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. * @see `try_replace` for a cleaner exception-less alternative. @@ -3104,7 +3106,7 @@ class basic_string { } /** - * @brief Replaces ( @b in-place ) a range of characters with a repetition of given characters. + * @brief Replaces @b (in-place) a range of characters with a repetition of given characters. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. * @see `try_replace` for a cleaner exception-less alternative. @@ -3115,7 +3117,7 @@ class basic_string { } /** - * @brief Replaces ( @b in-place ) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. * @see `try_replace` for a cleaner exception-less alternative. @@ -3134,7 +3136,7 @@ class basic_string { } /** - * @brief Replaces ( @b in-place ) a range of characters with a given initializer list. + * @brief Replaces @b (in-place) a range of characters with a given initializer list. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. * @see `try_replace` for a cleaner exception-less alternative. 
@@ -3332,13 +3334,12 @@ class basic_string { * @brief Overwrites the string with random binary data. * * @param nonce "Number used ONCE" to initialize the random number generator, @b don't repeat it! - * @param key A 128-bit key to initialize the AES-CTR block-cypher, zeros by default. */ - basic_string &randomize(sz_u64_t nonce, sz_aes128_block_t key = {}) noexcept { + basic_string &randomize(sz_u64_t nonce) noexcept { sz_ptr_t start; sz_size_t length; sz_string_range(&string_, &start, &length); - sz_generate(start, length, nonce, &key); + sz_generate(start, length, nonce); return *this; } @@ -3349,7 +3350,7 @@ class basic_string { */ basic_string &randomize() noexcept { static sz_u64_t nonce = 42; - return randomize(nonce++, {}); + return randomize(nonce++); } /** @@ -3372,7 +3373,7 @@ class basic_string { static basic_string random(size_type length) noexcept(false) { return basic_string(length, '\0').randomize(); } /** - * @brief Replaces ( @b in-place ) all occurrences of a given string with the ::replacement string. + * @brief Replaces @b (in-place) all occurrences of a given string with the ::replacement string. * Similar to `boost::algorithm::replace_all` and Python's `str.replace`. * * The implementation is not as composable, as using search ranges combined with a replacing mapping for matches, @@ -3385,7 +3386,7 @@ class basic_string { } /** - * @brief Replaces ( @b in-place ) all occurrences of a given character set with the ::replacement string. + * @brief Replaces @b (in-place) all occurrences of a given character set with the ::replacement string. * Similar to `boost::algorithm::replace_all` and Python's `str.replace`. * * The implementation is not as composable, as using search ranges combined with a replacing mapping for matches, @@ -3398,7 +3399,7 @@ class basic_string { } /** - * @brief Replaces ( @b in-place ) all occurrences of a given string with the ::replacement string. + * @brief Replaces @b (in-place) all occurrences of a given string with the ::replacement string. * Similar to `boost::algorithm::replace_all` and Python's `str.replace`. * * The implementation is not as composable, as using search ranges combined with a replacing mapping for matches, @@ -3410,7 +3411,7 @@ class basic_string { } /** - * @brief Replaces ( @b in-place ) all occurrences of a given character set with the ::replacement string. + * @brief Replaces @b (in-place) all occurrences of a given character set with the ::replacement string. * Similar to `boost::algorithm::replace_all` and Python's `str.replace`. * * The implementation is not as composable, as using search ranges combined with a replacing mapping for matches, @@ -3422,7 +3423,7 @@ class basic_string { } /** - * @brief Replaces ( @b in-place ) all characters in the string using the provided lookup table. + * @brief Replaces @b (in-place) all characters in the string using the provided lookup table. */ basic_string &transform(look_up_table const &table) noexcept { transform(table, data()); @@ -3917,20 +3918,16 @@ std::ptrdiff_t alignment_score( * @brief Overwrites the string slice with random characters from the given alphabet using the random generator. * * @param string The string to overwrite. - * @param generator A random generator function object that returns a random number in the range [0, 2^64). - * @param alphabet A string of characters to choose from. + * @param nonce "Number used ONCE" to initialize the random number generator, @b don't repeat it! 
*/ -template -void randomize( // - basic_string_slice string, generator_type_ &generator, - string_view alphabet = "abcdefghijklmnopqrstuvwxyz") noexcept { +template +void randomize(basic_string_slice string, sz_u64_t nonce) noexcept { static_assert(!std::is_const::value, "The string must be mutable."); - sz_random_generator_t generator_callback = &_call_random_generator; - sz_generate(alphabet.data(), alphabet.size(), string.data(), string.size(), generator_callback, &generator); + sz_generate(string.data(), string.size(), nonce); } /** - * @brief Replaces ( @b in-place ) all characters in the string using the provided lookup table. + * @brief Replaces @b (in-place) all characters in the string using the provided lookup table. */ template void transform(basic_string_slice string, basic_look_up_table const &table) noexcept { diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index a3b9d62e..b10f57a1 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -323,6 +323,7 @@ typedef char *sz_ptr_t; // A type alias for `char *` typedef char const *sz_cptr_t; // A type alias for `char const *` typedef sz_i8_t sz_error_cost_t; // Character mismatch cost for fuzzy matching functions +struct sz_hash_state_t; // Forward declaration of a hash state structure struct sz_sequence_t; // Forward declaration of an ordered collection of strings typedef sz_size_t sz_sorted_idx_t; // Index of a sorted string in a list of strings typedef sz_size_t sz_pgram_t; // "Pointer-sized N-gram" of a string @@ -406,7 +407,6 @@ SZ_PUBLIC void sz_charset_invert(sz_charset_t *s) { typedef void *(*sz_memory_allocate_t)(sz_size_t, void *); typedef void (*sz_memory_free_t)(void *, sz_size_t, void *); -typedef sz_u64_t (*sz_random_generator_t)(void *); /** * @brief Some complex pattern matching algorithms may require memory allocations. @@ -457,6 +457,9 @@ typedef sz_u64_t (*sz_hash_state_fold_t)(struct sz_hash_state_t const *); /** @brief Signature of ::sz_bytesum. */ typedef sz_u64_t (*sz_bytesum_t)(sz_cptr_t, sz_size_t); +/** @brief Signature of ::sz_generate. */ +typedef void (*sz_generate_t)(sz_ptr_t, sz_size_t, sz_u64_t); + /** @brief Signature of ::sz_equal. */ typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t); diff --git a/rust/lib.rs b/rust/lib.rs index 07db0a32..d9d4e237 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -54,10 +54,12 @@ pub mod sz { needle_length: usize, ) -> *const c_void; - fn sz_hash(text: *const c_void, length: usize) -> u64; - fn sz_bytesum(text: *const c_void, length: usize) -> u64; + fn sz_hash(text: *const c_void, length: usize, seed: u64) -> u64; + + fn sz_generate(text: *mut c_void, length: usize, seed: u64) -> u64; + fn sz_edit_distance( haystack1: *const c_void, haystack1_length: usize, @@ -102,14 +104,6 @@ pub mod sz { allocator: *const c_void, ) -> isize; - fn sz_generate( - alphabet: *const c_void, - alphabet_size: usize, - text: *mut c_void, - length: usize, - generate: *const c_void, - generator: *mut c_void, - ); } /// Computes the checksum value of unsigned bytes in a given byte slice `text`. 
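To tie the new C-level signatures together (seeded `sz_hash`, the `sz_hash_state_t` stream/fold pair, and nonce-driven `sz_generate`), here is a hedged sketch of the intended call pattern. Note that at this point in the series several backends are still placeholders returning zeros, and whether the folded streaming hash must equal the one-shot hash of the same bytes is an assumption of the sketch, not something the diff states.

#include <stringzilla/stringzilla.h>
#include <cstdio>

int main() {
    char buffer[128];
    sz_generate(buffer, sizeof(buffer), /*nonce=*/42u);               // fill with AES-CTR pseudo-random bytes
    sz_u64_t one_shot = sz_hash(buffer, sizeof(buffer), /*seed=*/7u); // one-shot, seeded hash

    sz_hash_state_t state; // the same bytes, hashed incrementally in 32-byte chunks
    sz_hash_state_init(&state, 7u);
    for (sz_size_t i = 0; i != sizeof(buffer); i += 32) sz_hash_state_stream(&state, buffer + i, 32);
    sz_u64_t streamed = sz_hash_state_fold(&state);

    std::printf("bytesum=%llu one-shot=%llu streamed=%llu\n",
                (unsigned long long)sz_bytesum(buffer, sizeof(buffer)), //
                (unsigned long long)one_shot, (unsigned long long)streamed);
    return 0;
}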
diff --git a/scripts/bench_token.cpp b/scripts/bench_token.cpp index 64ba2f96..93ae2b7e 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -40,11 +40,23 @@ tracked_unary_functions_t bytesum_functions() { tracked_unary_functions_t hashing_functions() { auto wrap_sz = [](auto function) -> unary_function_t { - return unary_function_t([function](std::string_view s) { return function(s.data(), s.size()); }); + return unary_function_t([function](std::string_view s) { return function(s.data(), s.size(), 42); }); }; tracked_unary_functions_t result = { - {"sz_hash_serial", wrap_sz(sz_hash_serial)}, {"std::hash", [](std::string_view s) { return std::hash {}(s); }}, + {"sz_hash_serial", wrap_sz(sz_hash_serial)}, +#if SZ_USE_HASWELL + {"sz_hash_haswell", wrap_sz(sz_hash_haswell)}, +#endif +#if SZ_USE_SKYLAKE + {"sz_hash_skylake", wrap_sz(sz_hash_skylake)}, +#endif +#if SZ_USE_ICE + {"sz_hash_ice", wrap_sz(sz_hash_ice)}, +#endif +#if SZ_USE_NEON + {"sz_hash_neon", wrap_sz(sz_hash_neon)}, +#endif }; return result; } @@ -65,11 +77,11 @@ tracked_unary_functions_t random_generation_functions(std::size_t token_length) randomize_string(buffer.data(), token_length, alphabet.data(), alphabet.size()); return token_length; })}, - {"sz::randomize" + suffix, unary_function_t([token_length](std::string_view alphabet) -> std::size_t { - sz::string_span span(buffer.data(), token_length); - sz::randomize(span, global_random_generator(), alphabet); - return token_length; - })}, + // {"sz::randomize" + suffix, unary_function_t([token_length](std::string_view alphabet) -> std::size_t { + // sz::string_span span(buffer.data(), token_length); + // sz::randomize(span, global_random_generator(), alphabet); + // return token_length; + // })}, }; return result; } diff --git a/scripts/test.cpp b/scripts/test.cpp index 58752a35..74282523 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -939,11 +939,8 @@ void test_non_stl_extensions_for_updates() { // Randomization. assert(str::random(0).empty()); - assert(str::random(4, "a") == "aaaa"); - assert(str::random(4, "aaaa") == "aaaa"); - assert(str::random(global_random_generator(), 4, "aaaa") == "aaaa"); - assert_scoped(str s = str::random(128, "ACGT"), (void)s, - s.contains('A') && s.contains('C') && s.contains('G') && s.contains('T')); + assert(str::random(4).size() == 4); + assert(str::random(4, 42).size() == 4); } /** From 1da0e2b7944914e12ae7d563d2031af21da08952 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 21 Feb 2025 07:25:30 +0000 Subject: [PATCH 111/751] Fix: Ice Lake partitioning logic --- include/stringzilla/sort.h | 88 +++++++++++++++++++------------------- scripts/bench_sort.cpp | 9 ++++ 2 files changed, 54 insertions(+), 43 deletions(-) diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index 4e1a6377..55a3677e 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -755,10 +755,8 @@ SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t c /** * @brief The most important part of the QuickSort algorithm partitioning the elements around the pivot. * Unlike the serial algorithm, uses compressed stores to filter and move the elements around the pivot. - * Assuming the extreme cost of shuffling between 2 ZMM registers based on 2 different masks, we use - * extra memory to store the elements smaller and greater than the pivot somewhere else. 
*/ -SZ_INTERNAL void _sz_sequence_argsort_ice_2way_partition( // +SZ_INTERNAL void _sz_sequence_argsort_ice_3way_partition( // sz_pgram_t *const initial_pgrams, sz_sorted_idx_t *const initial_order, // sz_pgram_t *const partitioned_pgrams, sz_sorted_idx_t *const partitioned_order, // sz_size_t const start_in_sequence, sz_size_t const end_in_sequence, // @@ -768,7 +766,7 @@ SZ_INTERNAL void _sz_sequence_argsort_ice_2way_partition( sz_size_t const pgrams_per_register = sizeof(sz_u512_vec_t) / sizeof(sz_pgram_t); // Choose the pivot offset with Sedgewick's method. - sz_pgram_t const *pivot_pgram_ptr = _sz_sequence_partitioning_pivot(initial_order + start_in_sequence, count); + sz_pgram_t const *pivot_pgram_ptr = _sz_sequence_partitioning_pivot(initial_pgrams + start_in_sequence, count); sz_pgram_t const pivot_pgram = *pivot_pgram_ptr; sz_u512_vec_t pivot_vec; pivot_vec.zmm = _mm512_set1_epi64(pivot_pgram); @@ -780,37 +778,35 @@ SZ_INTERNAL void _sz_sequence_argsort_ice_2way_partition( __mmask8 const tail_mask = _sz_u8_mask_until(tail_count); sz_u512_vec_t pgrams_vec, order_vec; - for (sz_size_t i = start_in_sequence; i < end_in_sequence; i += pgrams_per_register) { - pgrams_vec.zmm = // - i + pgrams_per_register <= end_in_sequence // - ? _mm512_loadu_si512(initial_pgrams + i) - : _mm512_maskz_loadu_epi64(tail_mask, initial_pgrams + i); + for (sz_size_t i = start_in_sequence; i + pgrams_per_register <= end_in_sequence; i += pgrams_per_register) { + pgrams_vec.zmm = _mm512_loadu_si512(initial_pgrams + i); count_smaller += sz_u32_popcount(_mm512_cmplt_epu64_mask(pgrams_vec.zmm, pivot_vec.zmm)); count_greater += sz_u32_popcount(_mm512_cmpgt_epu64_mask(pgrams_vec.zmm, pivot_vec.zmm)); } + if (tail_count) { + pgrams_vec.zmm = _mm512_maskz_loadu_epi64(tail_mask, initial_pgrams + end_in_sequence - tail_count); + count_smaller += sz_u32_popcount(_mm512_mask_cmplt_epu64_mask(tail_mask, pgrams_vec.zmm, pivot_vec.zmm)); + count_greater += sz_u32_popcount(_mm512_mask_cmpgt_epu64_mask(tail_mask, pgrams_vec.zmm, pivot_vec.zmm)); + } // Now all we need to do is to loop through the collection and export them into the temporary buffer // in 3 separate segments - smaller, equal, and greater than the pivot. sz_size_t const count_equal = count - count_smaller - count_greater; + _sz_assert(count_equal >= 1 && "The pivot must be present in the collection."); + _sz_assert(count_smaller + count_equal + count_greater == count && "The partitioning must be exhaustive."); sz_size_t smaller_offset = start_in_sequence; sz_size_t equal_offset = start_in_sequence + count_smaller; sz_size_t greater_offset = start_in_sequence + count_smaller + count_equal; // The naive algorithm - unzip the elements into 3 separate buffers. 
for (sz_size_t i = start_in_sequence; i < end_in_sequence; i += pgrams_per_register) { - if (i + pgrams_per_register <= end_in_sequence) { - pgrams_vec.zmm = _mm512_loadu_si512(initial_pgrams + i); - order_vec.zmm = _mm512_loadu_si512(initial_order + i); - } - else { - pgrams_vec.zmm = _mm512_maskz_loadu_epi64(tail_count, initial_pgrams + i); - order_vec.zmm = _mm512_maskz_loadu_epi64(tail_count, initial_order + i); - } - pgrams_vec.zmm = _mm512_loadu_si512(initial_pgrams + i); - order_vec.zmm = _mm512_loadu_si512(initial_order + i); - __mmask8 const smaller_mask = _mm512_cmplt_epu64_mask(pgrams_vec.zmm, pivot_vec.zmm); - __mmask8 const equal_mask = _mm512_cmpeq_epu64_mask(pgrams_vec.zmm, pivot_vec.zmm); - __mmask8 const greater_mask = _mm512_cmpgt_epu64_mask(pgrams_vec.zmm, pivot_vec.zmm); + __mmask8 const load_mask = i + pgrams_per_register <= end_in_sequence ? 0xFF : tail_mask; + pgrams_vec.zmm = _mm512_maskz_loadu_epi64(load_mask, initial_pgrams + i); + order_vec.zmm = _mm512_maskz_loadu_epi64(load_mask, initial_order + i); + + __mmask8 const smaller_mask = _mm512_mask_cmplt_epu64_mask(load_mask, pgrams_vec.zmm, pivot_vec.zmm); + __mmask8 const equal_mask = _mm512_mask_cmpeq_epu64_mask(load_mask, pgrams_vec.zmm, pivot_vec.zmm); + __mmask8 const greater_mask = _mm512_mask_cmpgt_epu64_mask(load_mask, pgrams_vec.zmm, pivot_vec.zmm); // Compress the elements into the temporary buffer. _mm512_mask_compressstoreu_epi64(partitioned_pgrams + smaller_offset, smaller_mask, pgrams_vec.zmm); @@ -827,29 +823,35 @@ SZ_INTERNAL void _sz_sequence_argsort_ice_2way_partition( } // Copy back. - sz_copy((sz_ptr_t)(initial_pgrams), (sz_cptr_t)(partitioned_pgrams), count_smaller * sizeof(sz_pgram_t)); - sz_copy((sz_ptr_t)(initial_order), (sz_cptr_t)(partitioned_order), count_smaller * sizeof(sz_pgram_t)); - sz_copy((sz_ptr_t)(initial_pgrams + count_smaller), // - (sz_cptr_t)(partitioned_pgrams + count_smaller), // - count_equal * sizeof(sz_pgram_t)); - sz_copy((sz_ptr_t)(initial_order + count_smaller), // - (sz_cptr_t)(partitioned_order + count_smaller), // - count_equal * sizeof(sz_pgram_t)); - sz_copy((sz_ptr_t)(initial_pgrams + count_smaller + count_equal), // - (sz_cptr_t)(partitioned_pgrams + count_smaller + count_equal), // - count_greater); - sz_copy((sz_ptr_t)(initial_order + count_smaller + count_equal), // - (sz_cptr_t)(partitioned_order + count_smaller + count_equal), // - count_greater); + sz_copy_skylake((sz_ptr_t)(initial_pgrams + start_in_sequence), // + (sz_cptr_t)(partitioned_pgrams + start_in_sequence), // + count_smaller * sizeof(sz_pgram_t)); + sz_copy_skylake((sz_ptr_t)(initial_order + start_in_sequence), // + (sz_cptr_t)(partitioned_order + start_in_sequence), // + count_smaller * sizeof(sz_sorted_idx_t)); + + sz_copy_skylake((sz_ptr_t)(initial_pgrams + start_in_sequence + count_smaller), // + (sz_cptr_t)(partitioned_pgrams + start_in_sequence + count_smaller), // + count_equal * sizeof(sz_pgram_t)); + sz_copy_skylake((sz_ptr_t)(initial_order + start_in_sequence + count_smaller), // + (sz_cptr_t)(partitioned_order + start_in_sequence + count_smaller), // + count_equal * sizeof(sz_sorted_idx_t)); + + sz_copy_skylake((sz_ptr_t)(initial_pgrams + start_in_sequence + count_smaller + count_equal), // + (sz_cptr_t)(partitioned_pgrams + start_in_sequence + count_smaller + count_equal), // + count_greater * sizeof(sz_pgram_t)); + sz_copy_skylake((sz_ptr_t)(initial_order + start_in_sequence + count_smaller + count_equal), // + (sz_cptr_t)(partitioned_order + start_in_sequence + count_smaller + 
count_equal), // + count_greater * sizeof(sz_sorted_idx_t)); // Return the offsets of the equal elements. - *first_pivot_offset = count_smaller; - *last_pivot_offset = count_smaller + count_equal; + *first_pivot_offset = start_in_sequence + count_smaller; + *last_pivot_offset = start_in_sequence + count_smaller + count_equal - 1; } /** * @brief Recursive Quick-Sort implementation backing both the `sz_sequence_argsort_ice` and `sz_pgrams_sort_ice`, - * and using the `_sz_sequence_argsort_ice_2way_partition` under the hood. + * and using the `_sz_sequence_argsort_ice_3way_partition` under the hood. */ SZ_INTERNAL void _sz_sequence_argsort_ice_recursively( // sz_pgram_t *initial_pgrams, sz_sorted_idx_t *initial_order, // @@ -868,17 +870,17 @@ SZ_INTERNAL void _sz_sequence_argsort_ice_recursively( // // Partition the collection around some pivot sz_size_t first_pivot_index, last_pivot_index; - _sz_sequence_argsort_ice_2way_partition( // + _sz_sequence_argsort_ice_3way_partition( // initial_pgrams, initial_order, temporary_pgrams, temporary_order, // start_in_sequence, end_in_sequence, // &first_pivot_index, &last_pivot_index); - // Recursively sort the left and right partitions, tracking where the output goes - if (start_in_sequence < first_pivot_index) + // Recursively sort the left and right partitions, if there are at least 2 elements in each + if (start_in_sequence + 1 < first_pivot_index) _sz_sequence_argsort_ice_recursively( // initial_pgrams, initial_order, temporary_pgrams, temporary_order, // start_in_sequence, first_pivot_index); - if (last_pivot_index + 1 < end_in_sequence) + if (last_pivot_index + 2 < end_in_sequence) _sz_sequence_argsort_ice_recursively( // initial_pgrams, initial_order, temporary_pgrams, temporary_order, // last_pivot_index + 1, end_in_sequence); diff --git a/scripts/bench_sort.cpp b/scripts/bench_sort.cpp index 729ac856..6bc0dd11 100644 --- a/scripts/bench_sort.cpp +++ b/scripts/bench_sort.cpp @@ -112,6 +112,15 @@ int main(int argc, char const **argv) { }); expect_sorted(pgrams, permute); + bench_permute("sz_pgrams_sort_ice", [&]() { + std::copy(pgrams.begin(), pgrams.end(), pgrams_sorted.begin()); + std::iota(permute.begin(), permute.end(), 0); + sz::_with_alloc([&](sz_memory_allocator_t &alloc) { + return sz_pgrams_sort_ice(pgrams_sorted.data(), pgrams_sorted.size(), &alloc, permute.data()); + }); + }); + expect_sorted(pgrams, permute); + // Unlike the `std::sort` adaptation above, the `sz_pgrams_sort_stable_serial` also sorts the input array inplace bench_permute("sz_pgrams_sort_stable_serial", [&]() { std::copy(pgrams.begin(), pgrams.end(), pgrams_sorted.begin()); From 69d4ecb656ed697eb0209931397587d0a8c165b1 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 21 Feb 2025 07:44:39 +0000 Subject: [PATCH 112/751] Add: `sz_sequence_argsort_ice` --- include/stringzilla/sort.h | 77 ++++++++++++++++++++++++++++++++++++++ scripts/bench_sort.cpp | 16 +++++++- 2 files changed, 91 insertions(+), 2 deletions(-) diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index 55a3677e..0273467a 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -906,6 +906,83 @@ SZ_PUBLIC sz_bool_t sz_pgrams_sort_ice(sz_pgram_t *pgrams, sz_size_t count, sz_m return sz_true_k; } +/** + * @brief Recursive Quick-Sort adaptation for strings, that processes the strings a few N-grams at a time. 
+ * It combines `_sz_sequence_argsort_serial_export_next_pgrams` and `_sz_sequence_argsort_serial_recursively`,
+ * recursively diving into the identical pgrams.
+ */
+SZ_PUBLIC void _sz_sequence_argsort_ice_next_pgrams(                            //
+    sz_sequence_t const *const sequence,                                        //
+    sz_pgram_t *const global_pgrams, sz_sorted_idx_t *const global_order,       //
+    sz_pgram_t *const temporary_pgrams, sz_sorted_idx_t *const temporary_order, //
+    sz_size_t const start_in_sequence, sz_size_t const end_in_sequence,         //
+    sz_size_t const start_character) {
+
+    // Prepare the new range of pgrams
+    _sz_sequence_argsort_serial_export_next_pgrams(sequence, global_pgrams, global_order, start_in_sequence,
+                                                   end_in_sequence, start_character);
+
+    // Sort current pgrams with a quicksort
+    _sz_sequence_argsort_ice_recursively(global_pgrams, global_order, temporary_pgrams, temporary_order,
+                                         start_in_sequence, end_in_sequence);
+
+    // Depending on the architecture, we will export a different number of bytes.
+    // On 32-bit architectures, we will export 3 bytes, and on 64-bit architectures - 7 bytes.
+    sz_size_t const pgram_capacity = sizeof(sz_pgram_t) - 1;
+
+    // Repeat the procedure for the identical pgrams
+    sz_size_t nested_start = start_in_sequence;
+    sz_size_t nested_end = start_in_sequence;
+    while (nested_end != end_in_sequence) {
+        // Find the end of the identical pgrams
+        sz_pgram_t current_pgram = global_pgrams[nested_start];
+        while (nested_end != end_in_sequence && current_pgram == global_pgrams[nested_end]) ++nested_end;
+
+        // If the identical pgrams are not trivial and each string has more characters, sort them recursively
+        sz_cptr_t current_pgram_str = (sz_cptr_t)&current_pgram;
+        sz_size_t current_pgram_length = (sz_size_t)current_pgram_str[0]; //! The byte order was swapped
+        int has_multiple_strings = nested_end - nested_start > 1;
+        int has_more_characters_in_each = current_pgram_length == pgram_capacity;
+        if (has_multiple_strings && has_more_characters_in_each) {
+            _sz_sequence_argsort_ice_next_pgrams(sequence, global_pgrams, global_order, temporary_pgrams,
+                                                 temporary_order, nested_start, nested_end,
+                                                 start_character + pgram_capacity);
+        }
+        // Move to the next
+        nested_start = nested_end;
+    }
+}
+
+SZ_PUBLIC sz_bool_t sz_sequence_argsort_ice(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc,
+                                            sz_sorted_idx_t *order) {
+
+    // First, initialize the `order` with `std::iota`-like behavior.
+    sz_size_t count = sequence->count;
+    for (sz_size_t i = 0; i != count; ++i) order[i] = i;
+
+    // On very small collections - just use the quadratic-complexity insertion sort
+    // without any smart optimizations or memory allocations.
+    if (count <= 32) {
+        sz_sequence_argsort_with_insertion(sequence, order);
+        return sz_true_k;
+    }
+
+    // Allocate memory for partitioning the elements around the pivot.
+    sz_size_t memory_usage = sizeof(sz_pgram_t) * count * 2 + sizeof(sz_sorted_idx_t) * count;
+    sz_pgram_t *global_pgrams = (sz_pgram_t *)alloc->allocate(memory_usage, alloc);
+    sz_pgram_t *temporary_pgrams = global_pgrams + count;
+    sz_sorted_idx_t *temporary_order = (sz_sorted_idx_t *)(temporary_pgrams + count);
+    if (!global_pgrams) return sz_false_k;
+
+    // Recursively sort the whole sequence.
+    _sz_sequence_argsort_ice_next_pgrams(sequence, global_pgrams, order, temporary_pgrams, temporary_order, //
+                                         0, count, 0);
+
+    // Free temporary storage.
+ alloc->free(global_pgrams, memory_usage, alloc); + return sz_true_k; +} + #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_ICE diff --git a/scripts/bench_sort.cpp b/scripts/bench_sort.cpp index 6bc0dd11..22758d95 100644 --- a/scripts/bench_sort.cpp +++ b/scripts/bench_sort.cpp @@ -139,7 +139,7 @@ int main(int argc, char const **argv) { }); expect_sorted(strings, permute); - bench_permute("sz_sequence_argsort", [&]() { + bench_permute("sz_sequence_argsort_serial", [&]() { std::iota(permute.begin(), permute.end(), 0); sz_sequence_t array; array.count = strings.size(); @@ -147,7 +147,19 @@ int main(int argc, char const **argv) { array.get_start = get_start; array.get_length = get_length; sz::_with_alloc( - [&](sz_memory_allocator_t &alloc) { return sz_sequence_argsort(&array, &alloc, permute.data()); }); + [&](sz_memory_allocator_t &alloc) { return sz_sequence_argsort_serial(&array, &alloc, permute.data()); }); + }); + expect_sorted(strings, permute); + + bench_permute("sz_sequence_argsort_ice", [&]() { + std::iota(permute.begin(), permute.end(), 0); + sz_sequence_t array; + array.count = strings.size(); + array.handle = &strings; + array.get_start = get_start; + array.get_length = get_length; + sz::_with_alloc( + [&](sz_memory_allocator_t &alloc) { return sz_sequence_argsort_ice(&array, &alloc, permute.data()); }); }); expect_sorted(strings, permute); From cc98389e82005783ffe8c0f28974207ad02ad83a Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 21 Feb 2025 07:44:53 +0000 Subject: [PATCH 113/751] Add: Sorting placeholders & dispatch --- c/lib.c | 36 ++++++++++++++++++++++++++++++++++ include/stringzilla/sort.h | 39 +++++++++++++++++++++++++++++++++++++ include/stringzilla/types.h | 9 +++++++++ 3 files changed, 84 insertions(+) diff --git a/c/lib.c b/c/lib.c index 559062ba..9d79fd98 100644 --- a/c/lib.c +++ b/c/lib.c @@ -196,6 +196,9 @@ typedef struct sz_implementations_t { sz_alignment_score_t alignment_score; sz_sequence_argsort_t sequence_argsort; + sz_pgrams_sort_t pgrams_sort; + sz_sequence_argsort_stable_t sequence_argsort_stable; + sz_pgrams_sort_stable_t pgrams_sort_stable; } sz_implementations_t; @@ -237,7 +240,11 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->edit_distance = sz_edit_distance_serial; impl->alignment_score = sz_alignment_score_serial; + impl->sequence_argsort = sz_sequence_argsort_serial; + impl->pgrams_sort = sz_pgrams_sort_serial; + impl->sequence_argsort_stable = sz_sequence_argsort_stable_serial; + impl->pgrams_sort_stable = sz_pgrams_sort_stable_serial; #if SZ_USE_HASWELL if (caps & sz_cap_haswell_k) { @@ -305,6 +312,11 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->hash_state_stream = sz_hash_state_stream_ice; impl->hash_state_fold = sz_hash_state_fold_ice; impl->generate = sz_generate_ice; + + impl->sequence_argsort = sz_sequence_argsort_ice; + impl->pgrams_sort = sz_pgrams_sort_ice; + impl->sequence_argsort_stable = sz_sequence_argsort_stable_ice; + impl->pgrams_sort_stable = sz_pgrams_sort_stable_ice; } #endif @@ -332,6 +344,15 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->rfind_from_set = sz_rfind_charset_neon; } #endif + +#if SZ_USE_SVE + if (caps & sz_cap_sve_k) { + impl->sequence_argsort = sz_sequence_argsort_sve; + impl->pgrams_sort = sz_pgrams_sort_sve; + impl->sequence_argsort_stable = sz_sequence_argsort_stable_sve; + impl->pgrams_sort_stable = sz_pgrams_sort_stable_sve; + } +#endif } #if defined(_MSC_VER) @@ -479,6 +500,21 @@ SZ_DYNAMIC sz_bool_t 
sz_sequence_argsort(sz_sequence_t const *array, sz_memory_a return sz_dispatch_table.sequence_argsort(array, alloc, order); } +SZ_DYNAMIC sz_bool_t sz_pgrams_sort(sz_pgram_t *array, sz_size_t count, sz_memory_allocator_t *alloc, + sz_size_t *order) { + return sz_dispatch_table.pgrams_sort(array, count, alloc, order); +} + +SZ_DYNAMIC sz_bool_t sz_sequence_argsort_stable(sz_sequence_t const *array, sz_memory_allocator_t *alloc, + sz_size_t *order) { + return sz_dispatch_table.sequence_argsort_stable(array, alloc, order); +} + +SZ_DYNAMIC sz_bool_t sz_pgrams_sort_stable(sz_pgram_t *array, sz_size_t count, sz_memory_allocator_t *alloc, + sz_size_t *order) { + return sz_dispatch_table.pgrams_sort_stable(array, count, alloc, order); +} + SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { sz_charset_t set; sz_charset_init(&set); diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index 0273467a..190074a3 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -996,7 +996,46 @@ SZ_PUBLIC sz_bool_t sz_sequence_argsort_ice(sz_sequence_t const *sequence, sz_me SZ_DYNAMIC sz_bool_t sz_sequence_argsort(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order) { +#if SZ_USE_ICE + return sz_sequence_argsort_ice(sequence, alloc, order); +#elif SZ_USE_SVE + return sz_sequence_argsort_sve(sequence, alloc, order); +#else + return sz_sequence_argsort_serial(sequence, alloc, order); +#endif +} + +SZ_DYNAMIC sz_bool_t sz_pgrams_sort(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { +#if SZ_USE_ICE + return sz_pgrams_sort_ice(pgrams, count, alloc, order); +#elif SZ_USE_SVE + return sz_pgrams_sort_sve(pgrams, count, alloc, order); +#else + return sz_pgrams_sort_serial(pgrams, count, alloc, order); +#endif +} + +SZ_DYNAMIC sz_bool_t sz_sequence_argsort_stable(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { +#if SZ_USE_ICE + return sz_sequence_argsort_ice(sequence, alloc, order); +#elif SZ_USE_SVE + return sz_sequence_argsort_sve(sequence, alloc, order); +#else return sz_sequence_argsort_serial(sequence, alloc, order); +#endif +} + +SZ_DYNAMIC sz_bool_t sz_pgrams_sort_stable(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { +#if SZ_USE_ICE + return sz_pgrams_sort_ice(pgrams, count, alloc, order); +#elif SZ_USE_SVE + return sz_pgrams_sort_sve(pgrams, count, alloc, order); +#else + return sz_pgrams_sort_serial(pgrams, count, alloc, order); +#endif } #endif // !SZ_DYNAMIC_DISPATCH diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index b10f57a1..825e36f9 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -497,6 +497,15 @@ typedef sz_ssize_t (*sz_alignment_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_s /** @brief Signature of ::sz_sequence_argsort. */ typedef sz_bool_t (*sz_sequence_argsort_t)(struct sz_sequence_t const *, sz_memory_allocator_t *, sz_sorted_idx_t *); +/** @brief Signature of ::sz_pgrams_sort. */ +typedef sz_bool_t (*sz_pgrams_sort_t)(sz_pgram_t *, sz_size_t, sz_memory_allocator_t *, sz_sorted_idx_t *); + +/** @brief Signature of ::sz_sequence_argsort_stable. */ +typedef sz_sequence_argsort_t sz_sequence_argsort_stable_t; + +/** @brief Signature of ::sz_pgrams_sort_stable. 
*/ +typedef sz_pgrams_sort_t sz_pgrams_sort_stable_t; + #pragma endregion #pragma region Helper Structures From 8bc161f4256c50d00a6c5f22e11612a00c47a12a Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 21 Feb 2025 13:27:23 +0000 Subject: [PATCH 114/751] Docs: Disable sorting includes --- .clang-format | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.clang-format b/.clang-format index c1418bae..c97feb6f 100644 --- a/.clang-format +++ b/.clang-format @@ -44,8 +44,8 @@ BraceWrapping: SplitEmptyNamespace: false IndentBraces: false -SortIncludes: true -SortUsingDeclarations: true +SortIncludes: false +SortUsingDeclarations: false SpaceAfterCStyleCast: false SpaceAfterLogicalNot: false From e0055d5d88828aba726c4baf87605abcb3865d39 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 21 Feb 2025 13:29:31 +0000 Subject: [PATCH 115/751] Break: `look_up_transform` to `lookup` API --- c/lib.c | 60 +++++----- include/stringzilla/memory.h | 163 ++++++++++++++++++++-------- include/stringzilla/stringzilla.hpp | 11 +- scripts/bench_memory.cpp | 8 +- 4 files changed, 157 insertions(+), 85 deletions(-) diff --git a/c/lib.c b/c/lib.c index 9d79fd98..b65784cc 100644 --- a/c/lib.c +++ b/c/lib.c @@ -176,7 +176,7 @@ typedef struct sz_implementations_t { sz_move_t copy; sz_move_t move; sz_fill_t fill; - sz_look_up_transform_t look_up_transform; + sz_lookup_t lookup; sz_bytesum_t bytesum; sz_hash_t hash; @@ -192,8 +192,8 @@ typedef struct sz_implementations_t { sz_find_set_t find_from_set; sz_find_set_t rfind_from_set; - sz_edit_distance_t edit_distance; - sz_alignment_score_t alignment_score; + sz_levenshtein_distance_t edit_distance; + sz_needleman_wunsch_score_t alignment_score; sz_sequence_argsort_t sequence_argsort; sz_pgrams_sort_t pgrams_sort; @@ -222,7 +222,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->copy = sz_copy_serial; impl->move = sz_move_serial; impl->fill = sz_fill_serial; - impl->look_up_transform = sz_look_up_transform_serial; + impl->lookup = sz_lookup_serial; impl->bytesum = sz_bytesum_serial; impl->hash = sz_hash_serial; @@ -238,8 +238,8 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->find_from_set = sz_find_charset_serial; impl->rfind_from_set = sz_rfind_charset_serial; - impl->edit_distance = sz_edit_distance_serial; - impl->alignment_score = sz_alignment_score_serial; + impl->edit_distance = sz_levenshtein_distance_serial; + impl->alignment_score = sz_needleman_wunsch_score_serial; impl->sequence_argsort = sz_sequence_argsort_serial; impl->pgrams_sort = sz_pgrams_sort_serial; @@ -254,7 +254,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->copy = sz_copy_haswell; impl->move = sz_move_haswell; impl->fill = sz_fill_haswell; - impl->look_up_transform = sz_look_up_transform_haswell; + impl->lookup = sz_lookup_haswell; impl->bytesum = sz_bytesum_haswell; impl->hash = sz_hash_haswell; @@ -301,10 +301,10 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->find_from_set = sz_find_charset_ice; impl->rfind_from_set = sz_rfind_charset_ice; - impl->edit_distance = sz_edit_distance_ice; - impl->alignment_score = sz_alignment_score_ice; + impl->edit_distance = sz_levenshtein_distance_ice; + impl->alignment_score = sz_needleman_wunsch_score_ice; - impl->look_up_transform = sz_look_up_transform_ice; + impl->lookup = sz_lookup_ice; impl->bytesum = sz_bytesum_ice; impl->hash = sz_hash_ice; @@ -327,7 +327,7 @@ SZ_DYNAMIC void 
sz_dispatch_table_init(void) { impl->copy = sz_copy_neon; impl->move = sz_move_neon; impl->fill = sz_fill_neon; - impl->look_up_transform = sz_look_up_transform_neon; + impl->lookup = sz_lookup_neon; impl->bytesum = sz_bytesum_neon; impl->hash = sz_hash_neon; @@ -433,8 +433,8 @@ SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) { sz_dispatch_table.fill(target, length, value); } -SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { - sz_dispatch_table.look_up_transform(source, length, lut, target); +SZ_DYNAMIC void sz_lookup(sz_ptr_t target, sz_size_t length, sz_cptr_t source, sz_cptr_t lut) { + sz_dispatch_table.lookup(target, length, source, lut); } SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) { @@ -475,43 +475,43 @@ SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8( // return sz_hamming_distance_utf8_serial(a, a_length, b, b_length, bound); } -SZ_DYNAMIC sz_size_t sz_edit_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // +SZ_DYNAMIC sz_size_t sz_levenshtein_distance( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // sz_size_t bound, sz_memory_allocator_t *alloc) { return sz_dispatch_table.edit_distance(a, a_length, b, b_length, bound, alloc); } -SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // +SZ_DYNAMIC sz_size_t sz_levenshtein_distance_utf8( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // sz_size_t bound, sz_memory_allocator_t *alloc) { - return _sz_edit_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc); + return _sz_levenshtein_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc); } -SZ_DYNAMIC sz_ssize_t sz_alignment_score( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // +SZ_DYNAMIC sz_ssize_t sz_needleman_wunsch_score( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { return sz_dispatch_table.alignment_score(a, a_length, b, b_length, subs, gap, alloc); } -SZ_DYNAMIC sz_bool_t sz_sequence_argsort(sz_sequence_t const *array, sz_memory_allocator_t *alloc, sz_size_t *order) { +SZ_DYNAMIC sz_status_t sz_sequence_argsort(sz_sequence_t const *array, sz_memory_allocator_t *alloc, sz_size_t *order) { return sz_dispatch_table.sequence_argsort(array, alloc, order); } -SZ_DYNAMIC sz_bool_t sz_pgrams_sort(sz_pgram_t *array, sz_size_t count, sz_memory_allocator_t *alloc, - sz_size_t *order) { +SZ_DYNAMIC sz_status_t sz_pgrams_sort(sz_pgram_t *array, sz_size_t count, sz_memory_allocator_t *alloc, + sz_size_t *order) { return sz_dispatch_table.pgrams_sort(array, count, alloc, order); } -SZ_DYNAMIC sz_bool_t sz_sequence_argsort_stable(sz_sequence_t const *array, sz_memory_allocator_t *alloc, - sz_size_t *order) { +SZ_DYNAMIC sz_status_t sz_sequence_argsort_stable(sz_sequence_t const *array, sz_memory_allocator_t *alloc, + sz_size_t *order) { return sz_dispatch_table.sequence_argsort_stable(array, alloc, order); } -SZ_DYNAMIC sz_bool_t sz_pgrams_sort_stable(sz_pgram_t *array, sz_size_t count, sz_memory_allocator_t *alloc, - sz_size_t *order) { +SZ_DYNAMIC sz_status_t sz_pgrams_sort_stable(sz_pgram_t *array, sz_size_t count, sz_memory_allocator_t *alloc, + sz_size_t *order) { return 
sz_dispatch_table.pgrams_sort_stable(array, count, alloc, order); } diff --git a/include/stringzilla/memory.h b/include/stringzilla/memory.h index d8db210b..de739f22 100644 --- a/include/stringzilla/memory.h +++ b/include/stringzilla/memory.h @@ -3,12 +3,12 @@ * @file memory.h * @author Ash Vardanian * - * Includes: + * Includes core APIs for contiguous memory operations: * * - `sz_copy` - analog to `memcpy` * - `sz_move` - analog to `memmove` * - `sz_fill` - analog to `memset` - * - `sz_look_up_transform` - LUT transformation of a string, similar to OpenCV LUT + * - `sz_lookup` - LUT transformation of a string, similar to OpenCV LUT * - TODO: `sz_detect_encoding` - similar to `iconv` or `chardet` * * Convenience functions for character-set mapping: @@ -28,11 +28,27 @@ extern "C" { /** * @brief Similar to `memcpy`, copies contents of one string into another. - * The behavior is undefined if the strings overlap. * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. + * @param[out] target String to copy into. Can be `NULL`, if the @p length is zero. + * @param[in] length Number of bytes to copy. Can be a zero. + * @param[in] source String to copy from. Can be `NULL`, if the @p length is zero. + * + * Example usage: + * + * @code{.c} + * #include + * int main() { + * char output[2]; + * sz_copy(output, "hi", 2); + * return output[0] == 'h' && output[1] == 'i' ? 0 : 1; + * } + * @endcode + * + * @pre The @p target and @p source must not overlap. + * @sa sz_move + * + * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. + * @sa sz_copy_serial, sz_copy_haswell, sz_copy_skylake, sz_copy_neon */ SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length); @@ -40,27 +56,92 @@ SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length); * @brief Similar to `memmove`, copies (moves) contents of one string into another. * Unlike `sz_copy`, allows overlapping strings as arguments. * - * @param target String to copy into. - * @param length Number of bytes to copy. - * @param source String to copy from. + * @param[out] target String to copy into. Can be `NULL`, if the @p length is zero. + * @param[in] length Number of bytes to copy. Can be a zero. + * @param[in] source String to copy from. Can be `NULL`, if the @p length is zero. + * + * Example usage: + * + * @code{.c} + * #include + * int main() { + * char buffer[3] = {'a', 'b', 'c'}; + * sz_move(buffer, buffer + 1, 2); + * return buffer[0] == 'b' && buffer[1] == 'c' && buffer[2] == 'c' ? 0 : 1; + * } + * @endcode + * + * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. + * @sa sz_move_serial, sz_move_haswell, sz_move_skylake, sz_move_neon */ SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length); /** * @brief Similar to `memset`, fills a string with a given value. * - * @param target String to fill. - * @param length Number of bytes to fill. - * @param value Value to fill with. + * @param[out] target String to fill. Can be `NULL`, if the @p length is zero. + * @param[in] length Number of bytes to fill. Can be a zero. + * @param[in] value Value to fill with. + * + * Example usage: + * + * @code{.c} + * #include + * int main() { + * char buffer[2]; + * sz_fill(buffer, 2, 'x'); + * return buffer[0] == 'x' && buffer[1] == 'x' ? 
0 : 1; + * } + * @endcode + * + * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. + * @sa sz_fill_serial, sz_fill_haswell, sz_fill_skylake, sz_fill_neon */ SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value); +/** + * @brief Look Up Table @b (LUT) transformation of a @p source string. Same as `for (char &c : text) c = lut[c]`. + * @see https://en.wikipedia.org/wiki/Lookup_table + * + * Can be used to implement some form of string normalization, partially masking punctuation marks, + * or converting between different character sets, like uppercase or lowercase. Surprisingly, also has + * broad implications in image processing, where image channel transformations are often done using LUTs. + * + * @param[out] target Output string, can point to the same address as @p source. + * @param[in] length Number of bytes in the string. + * @param[in] source String to be mapped using the @p lut table into the @p target. + * @param[in] lut Look Up Table to apply. Must be exactly @b 256 bytes long. + * + * Example usage: + * + * @code{.c} + * #include // for `tolower` + * #include + * int main() { + * char to_lower_lut[256]; + * for (int i = 0; i < 256; ++i) to_lower_lut[i] = tolower(i); + * char buffer[3] = {'A', 'B', 'C'}; + * sz_lookup(buffer, 3, buffer, to_lower_lut); + * return buffer[0] == 'a' && buffer[1] == 'b' && buffer[2] == 'c' ? 0 : 1; + * } + * @endcode + * + * @pre The @p lut must be exactly 256 bytes long, even if the @p source string has no characters in the top range. + * @pre The @p target and @p source can be the same, but must not overlap. + * + * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. + * @sa sz_lookup_serial, sz_lookup_haswell, sz_lookup_ice, sz_lookup_neon + */ +SZ_DYNAMIC void sz_lookup(sz_ptr_t target, sz_size_t length, sz_cptr_t source, sz_cptr_t lut); + /** @copydoc sz_copy */ SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); /** @copydoc sz_move */ SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length); /** @copydoc sz_fill */ SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value); +/** @copydoc sz_lookup */ +SZ_PUBLIC void sz_lookup_serial(sz_ptr_t target, sz_size_t length, sz_cptr_t source, sz_cptr_t lut); #if SZ_USE_HASWELL /** @copydoc sz_copy */ @@ -69,6 +150,8 @@ SZ_PUBLIC void sz_copy_haswell(sz_ptr_t target, sz_cptr_t source, sz_size_t leng SZ_PUBLIC void sz_move_haswell(sz_ptr_t target, sz_cptr_t source, sz_size_t length); /** @copydoc sz_rfind_fill */ SZ_PUBLIC void sz_fill_haswell(sz_ptr_t target, sz_size_t length, sz_u8_t value); +/** @copydoc sz_lookup */ +SZ_PUBLIC void sz_lookup_haswell(sz_ptr_t target, sz_size_t length, sz_cptr_t source, sz_cptr_t lut); #endif #if SZ_USE_SKYLAKE @@ -80,6 +163,11 @@ SZ_PUBLIC void sz_move_skylake(sz_ptr_t target, sz_cptr_t source, sz_size_t leng SZ_PUBLIC void sz_fill_skylake(sz_ptr_t target, sz_size_t length, sz_u8_t value); #endif +#if SZ_USE_ICE +/** @copydoc sz_lookup */ +SZ_PUBLIC void sz_lookup_ice(sz_ptr_t target, sz_size_t length, sz_cptr_t source, sz_cptr_t lut); +#endif + #if SZ_USE_NEON /** @copydoc sz_copy */ SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); @@ -87,25 +175,10 @@ SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length); /** @copydoc sz_rfind_fill */ 
SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value); +/** @copydoc sz_lookup */ +SZ_PUBLIC void sz_lookup_neon(sz_ptr_t target, sz_size_t length, sz_cptr_t source, sz_cptr_t lut); #endif -/** - * @brief Look Up Table @b (LUT) transformation of a string. Equivalent to `for (char & c : text) c = lut[c]`. - * - * Can be used to implement some form of string normalization, partially masking punctuation marks, - * or converting between different character sets, like uppercase or lowercase. Surprisingly, also has - * broad implications in image processing, where image channel transformations are often done using LUTs. - * - * @param text String to be normalized. - * @param length Number of bytes in the string. - * @param lut Look Up Table to apply. Must be exactly @b 256 bytes long. - * @param result Output string, can point to the same address as ::text. - */ -SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); - -/** @copydoc sz_look_up_transform */ -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result); - #pragma endregion // Core API #pragma region Helper API @@ -120,7 +193,7 @@ SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_ * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html * * @param text String to be normalized. - * @param length Number of bytes in the string. + * @param[in] length Number of bytes in the string. * @param result Output string, can point to the same address as ::text. */ SZ_PUBLIC void sz_tolower(sz_cptr_t text, sz_size_t length, sz_ptr_t result); @@ -135,7 +208,7 @@ SZ_PUBLIC void sz_tolower(sz_cptr_t text, sz_size_t length, sz_ptr_t result); * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html * * @param text String to be normalized. - * @param length Number of bytes in the string. + * @param[in] length Number of bytes in the string. * @param result Output string, can point to the same address as ::text. */ SZ_PUBLIC void sz_toupper(sz_cptr_t text, sz_size_t length, sz_ptr_t result); @@ -144,7 +217,7 @@ SZ_PUBLIC void sz_toupper(sz_cptr_t text, sz_size_t length, sz_ptr_t result); * @brief Equivalent to `for (char & c : text) c = toascii(c)`. * * @param text String to be normalized. - * @param length Number of bytes in the string. + * @param[in] length Number of bytes in the string. * @param result Output string, can point to the same address as ::text. */ SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result); @@ -203,7 +276,7 @@ SZ_INTERNAL sz_u8_t sz_u8_toupper(sz_u8_t c) { return upped[c]; } -SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result) { +SZ_PUBLIC void sz_lookup_serial(sz_ptr_t result, sz_size_t length, sz_cptr_t text, sz_cptr_t lut) { sz_u8_t const *unsigned_lut = (sz_u8_t const *)lut; sz_u8_t const *unsigned_text = (sz_u8_t const *)text; sz_u8_t *unsigned_result = (sz_u8_t *)result; @@ -454,13 +527,13 @@ SZ_PUBLIC void sz_move_haswell(sz_ptr_t target, sz_cptr_t source, sz_size_t leng } } -SZ_PUBLIC void sz_look_up_transform_haswell(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { +SZ_PUBLIC void sz_lookup_haswell(sz_ptr_t target, sz_size_t length, sz_cptr_t source, sz_cptr_t lut) { // If the input is tiny (especially smaller than the look-up table itself), we may end up paying // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. 
// But if at least 3 cache lines are touched, the AVX-2 implementation should be faster. if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); + sz_lookup_serial(target, length, source, lut); return; } @@ -587,7 +660,7 @@ SZ_PUBLIC void sz_look_up_transform_haswell(sz_cptr_t source, sz_size_t length, } // Handle the tail. - if (length) sz_look_up_transform_serial(source, length, lut, target); + if (length) sz_lookup_serial(target, length, source, lut); } #pragma clang attribute pop @@ -838,13 +911,13 @@ SZ_PUBLIC void sz_move_skylake(sz_ptr_t target, sz_cptr_t source, sz_size_t leng #pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \ apply_to = function) -SZ_PUBLIC void sz_look_up_transform_ice(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { +SZ_PUBLIC void sz_lookup_ice(sz_ptr_t target, sz_size_t length, sz_cptr_t source, sz_cptr_t lut) { // If the input is tiny (especially smaller than the look-up table itself), we may end up paying // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. // But if at least 3 cache lines are touched, the AVX-512 implementation should be faster. if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); + sz_lookup_serial(target, length, source, lut); return; } @@ -1075,12 +1148,12 @@ SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value) { if (length) sz_fill_serial(target, length, value); } -SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { +SZ_PUBLIC void sz_lookup_neon(sz_ptr_t target, sz_size_t length, sz_cptr_t source, sz_cptr_t lut) { // If the input is tiny (especially smaller than the look-up table itself), we may end up paying // more for organizing the SIMD registers and changing the CPU state, than for the actual computation. 
if (length <= 128) { - sz_look_up_transform_serial(source, length, lut, target); + sz_lookup_serial(target, length, source, lut); return; } @@ -1291,15 +1364,15 @@ SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) { #endif } -SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) { +SZ_DYNAMIC void sz_lookup(sz_ptr_t target, sz_size_t length, sz_cptr_t source, sz_cptr_t lut) { #if SZ_USE_ICE - sz_look_up_transform_ice(source, length, lut, target); + sz_lookup_ice(target, length, source, lut); #elif SZ_USE_HASWELL - sz_look_up_transform_haswell(source, length, lut, target); + sz_lookup_haswell(target, length, source, lut); #elif SZ_USE_NEON - sz_look_up_transform_neon(source, length, lut, target); + sz_lookup_neon(target, length, source, lut); #else - sz_look_up_transform_serial(source, length, lut, target); + sz_lookup_serial(target, length, source, lut); #endif } diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index 3f5466b6..24f8fc94 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -3438,7 +3438,7 @@ class basic_string { sz_ptr_t start; sz_size_t length; sz_string_range(&string_, &start, &length); - sz_look_up_transform((sz_cptr_t)start, (sz_size_t)length, (sz_cptr_t)table.raw(), (sz_ptr_t)output); + sz_lookup((sz_ptr_t)output, (sz_size_t)length, (sz_cptr_t)start, (sz_cptr_t)table.raw()); } private: @@ -3930,21 +3930,20 @@ void randomize(basic_string_slice string, sz_u64_t nonce) noexcept { * @brief Replaces @b (in-place) all characters in the string using the provided lookup table. */ template -void transform(basic_string_slice string, basic_look_up_table const &table) noexcept { +void lookup(basic_string_slice string, basic_look_up_table const &table) noexcept { static_assert(sizeof(char_type_) == 1, "The character type must be 1 byte long."); - sz_look_up_transform((sz_cptr_t)string.data(), (sz_size_t)string.size(), (sz_cptr_t)table.raw(), - (sz_ptr_t)string.data()); + sz_lookup((sz_ptr_t)string.data(), (sz_size_t)string.size(), (sz_cptr_t)string.data(), (sz_cptr_t)table.raw()); } /** * @brief Maps all characters in the current string into another buffer using the provided lookup table. 
*/ template -void transform( // +void lookup( // basic_string_slice source, basic_look_up_table const &table, char_type_ *target) noexcept { static_assert(sizeof(char_type_) == 1, "The character type must be 1 byte long."); - sz_look_up_transform((sz_cptr_t)source.data(), (sz_size_t)source.size(), (sz_cptr_t)table.raw(), (sz_ptr_t)target); + sz_lookup((sz_ptr_t)target, (sz_size_t)source.size(), (sz_cptr_t)source.data(), (sz_cptr_t)table.raw()); } /** diff --git a/scripts/bench_memory.cpp b/scripts/bench_memory.cpp index 7a9acf25..4f52c282 100644 --- a/scripts/bench_memory.cpp +++ b/scripts/bench_memory.cpp @@ -191,15 +191,15 @@ tracked_unary_functions_t transform_functions() { std::transform(slice.begin(), slice.end(), output, [](char c) { return c + 1; }); return slice.size(); })}, - {"sz_look_up_transform_serial", wrap_sz(sz_look_up_transform_serial)}, + {"sz_lookup_serial", wrap_sz(sz_lookup_serial)}, #if SZ_USE_ICE - {"sz_look_up_transform_ice", wrap_sz(sz_look_up_transform_ice)}, + {"sz_lookup_ice", wrap_sz(sz_lookup_ice)}, #endif #if SZ_USE_HASWELL - {"sz_look_up_transform_haswell", wrap_sz(sz_look_up_transform_haswell)}, + {"sz_lookup_haswell", wrap_sz(sz_lookup_haswell)}, #endif #if SZ_USE_NEON - {"sz_look_up_transform_neon", wrap_sz(sz_look_up_transform_neon)}, + {"sz_lookup_neon", wrap_sz(sz_lookup_neon)}, #endif }; return result; From 944804e32588e5a785207644e4a942c18d43a13b Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 21 Feb 2025 13:34:41 +0000 Subject: [PATCH 116/751] Break: Return error-codes in sort functions --- include/stringzilla/sort.h | 353 +++++++++++++++++++++++------------- include/stringzilla/types.h | 19 ++ 2 files changed, 245 insertions(+), 127 deletions(-) diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index 190074a3..a394b646 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -23,7 +23,6 @@ * * - `sz_sequence_argsort_with_insertion` - for string collections. * - `sz_pgrams_sort_stable_with_insertion` - for continuous unsigned integers. - * */ #ifndef STRINGZILLA_SORT_H_ #define STRINGZILLA_SORT_H_ @@ -41,113 +40,209 @@ extern "C" { /** * @brief Faster @b arg-sort for an arbitrary @b string sequence, using QuickSort. - * Outputs the ::order of elements in the immutable ::sequence, that would sort it. - * The algorithm doesn't guarantee stability, meaning that the relative order of equal elements - * may not be preserved. + * Outputs the @p order of elements in the immutable @p sequence, that would sort it. + * + * @param[in] sequence Immutable sequence of strings to sort. + * @param[in] alloc Optional memory allocator for temporary storage. + * @param[out] order Output permutation that sorts the elements. Must fit at least `sequence->count` integers. + * + * @retval `sz_success_k` if the operation was successful. + * @retval `sz_bad_alloc_k` if the operation failed due to memory allocation failure. + * @post The @p order array will contain a valid permutation of `[0, sequence->count - 1]`. + * + * Example usage: + * + * @code{.c} + * #include + * int main() { + * char const *strings[] = {"banana", "apple", "cherry"}; + * sz_sequence_t sequence; + * sz_sequence_from_null_terminated_strings(strings, 3, &sequence); + * sz_sorted_idx_t order[3]; + * sz_sequence_argsort(&sequence, NULL, order); + * return order[0] == 1 && order[1] == 0 && order[2] == 2 ? 
0 : 1; + * } + * @endcode + * + * @note The algorithm has linear memory complexity, quadratic worst-case and log-linear average time complexity. + * @see https://en.wikipedia.org/wiki/Quicksort + * + * @note This algorithm is @b unstable: equal elements may change relative order. + * @sa sz_sequence_argsort_stable * - * @param sequence The sequence of strings to sort. - * @param alloc Memory allocator for temporary storage. - * @param order The output - indices of the sorted sequence elements. - * @return Whether the operation was successful. + * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. + * @sa sz_sequence_argsort_serial, sz_sequence_argsort_skylake, sz_sequence_argsort_sve */ -SZ_DYNAMIC sz_bool_t sz_sequence_argsort(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_DYNAMIC sz_status_t sz_sequence_argsort(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); /** * @brief Faster @b inplace `std::sort` for a continuous @b unsigned-integer sequence, using QuickSort. - * Overwrites the input ::sequence with the sorted sequence and exports the permutation ::order. - * The algorithm doesn't guarantee stability, meaning that the relative order of equal elements - * may not be preserved. + * Overwrites the input @p pgrams with the sorted sequence and exports the @p order permutation. + * + * @param[inout] pgrams Continuous buffer of unsigned integers to sort in place. + * @param[in] count Number of elements in the sequence. + * @param[in] alloc Optional memory allocator for temporary storage. + * @param[out] order Output permutation that sorts the elements. Must fit at least @p count integers. + * + * @retval `sz_success_k` if the operation was successful. + * @retval `sz_bad_alloc_k` if the operation failed due to memory allocation failure. + * @post The @p order array will contain a valid permutation of `[0, count - 1]`. + * + * Example usage: + * + * @code{.c} + * #include + * int main() { + * sz_pgram_t pgrams[] = {42, 17, 99, 8}; + * sz_sorted_idx_t order[4]; + * sz_pgrams_sort(pgrams, 4, NULL, order); + * return order[0] == 3 && order[1] == 1 && order[2] == 0 && order[3] == 2 ? 0 : 1; + * } + * @endcode * - * @param pgrams The continuous buffer of unsigned integers to sort in place. - * @param count The number of elements in the sequence. - * @param alloc Memory allocator for temporary storage. - * @param order The output - indices of the sorted sequence elements. - * @return Whether the operation was successful. + * @note The algorithm has linear memory complexity, quadratic worst-case and log-linear average time complexity. + * @see https://en.wikipedia.org/wiki/Quicksort + * + * @note This algorithm is @b unstable: equal elements may change relative order. + * @sa sz_pgrams_sort_stable + * + * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. + * @sa sz_pgrams_sort_serial, sz_pgrams_sort_skylake, sz_pgrams_sort_sve */ -SZ_DYNAMIC sz_bool_t sz_pgrams_sort(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_DYNAMIC sz_status_t sz_pgrams_sort(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); /** * @brief Faster @b arg-sort for an arbitrary @b string sequence, using MergeSort. - * Outputs the ::order of elements in the immutable ::sequence, that would sort it. 
- * The algorithm guarantees stability, meaning that the relative order of equal elements is preserved. + * Outputs the @p order of elements in the immutable @p sequence, that would sort it. + * + * This algorithm guarantees stability, ensuring that the relative order of equal elements is preserved. + * It uses more memory than `sz_sequence_argsort`, but its performance is more predictable. + * It's preferred for very large inputs, as most memory access happens in a sequential pattern. + * + * @param[in] sequence Immutable sequence of strings to sort. + * @param[in] alloc Optional memory allocator for temporary storage. + * @param[out] order Output permutation that sorts the elements. Must fit at least `sequence->count` integers. + * + * @retval `sz_success_k` if the operation was successful. + * @retval `sz_bad_alloc_k` if the operation failed due to memory allocation failure. + * @post The @p order array will contain a valid permutation of `[0, sequence->count - 1]`. + * + * Example usage: + * + * @code{.c} + * #include + * int main() { + * char const *strings[] = {"banana", "apple", "cherry"}; + * sz_sequence_t sequence; + * sz_sequence_from_null_terminated_strings(strings, 3, &sequence); + * sz_sorted_idx_t order[3]; + * sz_sequence_argsort_stable(&sequence, NULL, order); + * return order[0] == 1 && order[1] == 0 && order[2] == 2 ? 0 : 1; + * } + * @endcode * - * This algorithm uses more memory than `sz_sequence_argsort`, but it's performance is more predictable. - * It's also preferred for very large inputs, as most memory access happens in a predictable sequential order. + * @note The algorithm has linear memory complexity and log-linear time complexity. + * @see https://en.wikipedia.org/wiki/Merge_sort * - * @param sequence The sequence of strings to sort. - * @param alloc Memory allocator for temporary storage. - * @param order The output - indices of the sorted sequence elements. - * @return Whether the operation was successful. + * @note This algorithm is @b stable: equal elements maintain their relative order. + * @sa sz_sequence_argsort + * + * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. + * @sa sz_sequence_argsort_stable_serial, sz_sequence_argsort_stable_skylake, sz_sequence_argsort_stable_sve */ -SZ_DYNAMIC sz_bool_t sz_sequence_argsort_stable(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_DYNAMIC sz_status_t sz_sequence_argsort_stable(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); /** - * @brief Faster @b inplace `std::stable_sort sort` for a continuous @b unsigned-integer sequence, using MergeSort. - * Overwrites the input ::sequence with the sorted sequence and exports the permutation ::order. - * The algorithm guarantees stability, meaning that the relative order of equal elements is preserved. + * @brief Faster @b inplace `std::stable_sort` for a continuous @b unsigned-integer sequence, using MergeSort. + * Overwrites the input @p pgrams with the sorted sequence and exports the @p order permutation. + * + * This algorithm guarantees stability, ensuring that the relative order of equal elements is preserved. + * It uses more memory than `sz_pgrams_sort`, but its performance is more predictable. + * It's preferred for very large inputs, as most memory access happens in a sequential pattern. + * + * @param[inout] pgrams Continuous buffer of unsigned integers to sort in place. + * @param[in] count Number of elements in the sequence. 
+ * @param[in] alloc Optional memory allocator for temporary storage. + * @param[out] order Output permutation that sorts the elements. Must fit at least @p count integers. + * + * @retval `sz_success_k` if the operation was successful. + * @retval `sz_bad_alloc_k` if the operation failed due to memory allocation failure. + * @post The @p order array will contain a valid permutation of `[0, count - 1]`. + * + * Example usage: * - * This algorithm uses more memory than `sz_pgrams_sort`, but it's performance is more predictable. - * It's also preferred for very large inputs, as most memory access happens in a predictable sequential order. + * @code{.c} + * #include + * int main() { + * sz_pgram_t pgrams[] = {42, 17, 99, 8}; + * sz_sorted_idx_t order[4]; + * sz_pgrams_sort_stable(pgrams, 4, NULL, order); + * return order[0] == 3 && order[1] == 1 && order[2] == 0 && order[3] == 2 ? 0 : 1; + * } + * @endcode * - * @param pgrams The continuous buffer of unsigned integers to sort in place. - * @param count The number of elements in the sequence. - * @param alloc Memory allocator for temporary storage. - * @param order The output - indices of the sorted sequence elements. - * @return Whether the operation was successful. + * @note The algorithm has linear memory complexity and log-linear time complexity. + * @see [MergeSort Algorithm](https://en.wikipedia.org/wiki/Merge_sort) + * + * @note This algorithm is @b stable: equal elements maintain their relative order. + * @sa sz_pgrams_sort + * + * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. + * @sa sz_pgrams_sort_stable_serial, sz_pgrams_sort_stable_skylake, sz_pgrams_sort_stable_sve */ -SZ_DYNAMIC sz_bool_t sz_pgrams_sort_stable(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_DYNAMIC sz_status_t sz_pgrams_sort_stable(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); /** @copydoc sz_sequence_argsort */ -SZ_PUBLIC sz_bool_t sz_sequence_argsort_serial(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_PUBLIC sz_status_t sz_sequence_argsort_serial(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); /** @copydoc sz_pgrams_sort */ -SZ_PUBLIC sz_bool_t sz_pgrams_sort_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_PUBLIC sz_status_t sz_pgrams_sort_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); /** @copydoc sz_sequence_argsort */ -SZ_PUBLIC sz_bool_t sz_sequence_argsort_ice(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_PUBLIC sz_status_t sz_sequence_argsort_skylake(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); /** @copydoc sz_pgrams_sort */ -SZ_PUBLIC sz_bool_t sz_pgrams_sort_ice(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_PUBLIC sz_status_t sz_pgrams_sort_skylake(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); /** @copydoc sz_sequence_argsort */ -SZ_PUBLIC sz_bool_t sz_sequence_argsort_sve(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_PUBLIC sz_status_t sz_sequence_argsort_sve(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); /** @copydoc sz_pgrams_sort */ 
-SZ_PUBLIC sz_bool_t sz_pgrams_sort_sve(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_PUBLIC sz_status_t sz_pgrams_sort_sve(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); /** @copydoc sz_sequence_argsort_stable */ -SZ_PUBLIC sz_bool_t sz_sequence_argsort_stable_serial(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_PUBLIC sz_status_t sz_sequence_argsort_stable_serial(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); /** @copydoc sz_pgrams_sort_stable */ -SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_PUBLIC sz_status_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); /** @copydoc sz_sequence_argsort_stable */ -SZ_PUBLIC sz_bool_t sz_sequence_argsort_stable_ice(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_PUBLIC sz_status_t sz_sequence_argsort_stable_skylake(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); /** @copydoc sz_pgrams_sort_stable */ -SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_ice(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_PUBLIC sz_status_t sz_pgrams_sort_stable_skylake(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); /** @copydoc sz_sequence_argsort_stable */ -SZ_PUBLIC sz_bool_t sz_sequence_argsort_stable_sve(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_PUBLIC sz_status_t sz_sequence_argsort_stable_sve(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); /** @copydoc sz_pgrams_sort_stable */ -SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_sve(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_PUBLIC sz_status_t sz_pgrams_sort_stable_sve(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); #pragma endregion @@ -557,8 +652,8 @@ SZ_PUBLIC void _sz_sequence_argsort_serial_next_pgrams( // } } -SZ_PUBLIC sz_bool_t sz_sequence_argsort_serial(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { +SZ_PUBLIC sz_status_t sz_sequence_argsort_serial(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { // First, initialize the `order` with `std::iota`-like behavior. for (sz_size_t i = 0; i != sequence->count; ++i) order[i] = i; @@ -567,7 +662,7 @@ SZ_PUBLIC sz_bool_t sz_sequence_argsort_serial(sz_sequence_t const *sequence, sz // without any smart optimizations or memory allocations. if (sequence->count <= 32) { sz_sequence_argsort_with_insertion(sequence, order); - return sz_true_k; + return sz_success_k; } // One of the reasons for slow string operations is the significant overhead of branching when performing @@ -583,24 +678,24 @@ SZ_PUBLIC sz_bool_t sz_sequence_argsort_serial(sz_sequence_t const *sequence, sz // iteration of a recursive algorithm. sz_size_t memory_usage = sequence->count * sizeof(sz_pgram_t); sz_pgram_t *pgrams = (sz_pgram_t *)alloc->allocate(memory_usage, alloc); - if (!pgrams) return sz_false_k; + if (!pgrams) return sz_bad_alloc_k; // Recursively sort the whole sequence. 
_sz_sequence_argsort_serial_next_pgrams(sequence, pgrams, order, 0, sequence->count, 0); // Free temporary storage. alloc->free(pgrams, memory_usage, alloc); - return sz_true_k; + return sz_success_k; } -SZ_PUBLIC sz_bool_t sz_pgrams_sort_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { +SZ_PUBLIC sz_status_t sz_pgrams_sort_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { sz_unused(alloc); // First, initialize the `order` with `std::iota`-like behavior. for (sz_size_t i = 0; i != count; ++i) order[i] = i; // Reuse the string sorting algorithm for sorting the "pgrams". _sz_sequence_argsort_serial_recursively((sz_pgram_t *)pgrams, order, 0, count); - return sz_true_k; + return sz_success_k; } #pragma endregion // Serial QuickSort Implementation @@ -658,8 +753,8 @@ SZ_INTERNAL void _sz_sequence_argsort_stable_serial_merge( _sz_assert(merged_begin[i - 1] <= merged_begin[i] && "The merged pgrams must be in ascending order."); } -SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { +SZ_PUBLIC sz_status_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { // First, initialize the `order` with `std::iota`-like behavior. for (sz_size_t i = 0; i != count; ++i) order[i] = i; @@ -668,7 +763,7 @@ SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t c // without any smart optimizations or memory allocations. if (count <= 32) { sz_pgrams_sort_stable_with_insertion(pgrams, count, order); - return sz_true_k; + return sz_success_k; } // Go through short chunks of 8 elements and sort them with a sorting network. @@ -686,7 +781,7 @@ SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t c sz_size_t memory_usage = sizeof(sz_pgram_t) * count + sizeof(sz_sorted_idx_t) * count; sz_pgram_t *pgrams_temporary = (sz_pgram_t *)alloc->allocate(memory_usage, alloc); sz_sorted_idx_t *order_temporary = (sz_sorted_idx_t *)(pgrams_temporary + count); - if (!pgrams_temporary) return sz_false_k; + if (!pgrams_temporary) return sz_bad_alloc_k; // Set initial run size (the sorted chunks). sz_size_t run_size = 8; @@ -731,7 +826,13 @@ SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t c // Free the temporary memory used for merging. alloc->free(pgrams_temporary, memory_usage, alloc); - return sz_true_k; + return sz_success_k; +} + +SZ_PUBLIC sz_status_t sz_sequence_argsort_stable_serial(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { + + return sz_success_k; } #pragma endregion // Serial MergeSort Implementation @@ -744,19 +845,17 @@ SZ_PUBLIC sz_bool_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t c * * We are going to use VBMI2 for `_mm256_maskz_compress_epi8`. 
*/ -#pragma region Ice Lake Implementation -#if SZ_USE_ICE +#pragma region Skylake Implementation +#if SZ_USE_SKYLAKE #pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "avx512vbmi2", "bmi", "bmi2") -#pragma clang attribute push( \ - __attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vbmi2,bmi,bmi2"))), \ - apply_to = function) +#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") +#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) /** * @brief The most important part of the QuickSort algorithm partitioning the elements around the pivot. * Unlike the serial algorithm, uses compressed stores to filter and move the elements around the pivot. */ -SZ_INTERNAL void _sz_sequence_argsort_ice_3way_partition( // +SZ_INTERNAL void _sz_sequence_argsort_skylake_3way_partition( // sz_pgram_t *const initial_pgrams, sz_sorted_idx_t *const initial_order, // sz_pgram_t *const partitioned_pgrams, sz_sorted_idx_t *const partitioned_order, // sz_size_t const start_in_sequence, sz_size_t const end_in_sequence, // @@ -850,10 +949,10 @@ SZ_INTERNAL void _sz_sequence_argsort_ice_3way_partition( } /** - * @brief Recursive Quick-Sort implementation backing both the `sz_sequence_argsort_ice` and `sz_pgrams_sort_ice`, - * and using the `_sz_sequence_argsort_ice_3way_partition` under the hood. + * @brief Recursive Quick-Sort implementation backing both the `sz_sequence_argsort_skylake` and + * `sz_pgrams_sort_skylake`, and using the `_sz_sequence_argsort_skylake_3way_partition` under the hood. */ -SZ_INTERNAL void _sz_sequence_argsort_ice_recursively( // +SZ_INTERNAL void _sz_sequence_argsort_skylake_recursively( // sz_pgram_t *initial_pgrams, sz_sorted_idx_t *initial_order, // sz_pgram_t *temporary_pgrams, sz_sorted_idx_t *temporary_order, // sz_size_t const start_in_sequence, sz_size_t const end_in_sequence) { @@ -870,24 +969,24 @@ SZ_INTERNAL void _sz_sequence_argsort_ice_recursively( // // Partition the collection around some pivot sz_size_t first_pivot_index, last_pivot_index; - _sz_sequence_argsort_ice_3way_partition( // + _sz_sequence_argsort_skylake_3way_partition( // initial_pgrams, initial_order, temporary_pgrams, temporary_order, // start_in_sequence, end_in_sequence, // &first_pivot_index, &last_pivot_index); // Recursively sort the left and right partitions, if there are at least 2 elements in each if (start_in_sequence + 1 < first_pivot_index) - _sz_sequence_argsort_ice_recursively( // + _sz_sequence_argsort_skylake_recursively( // initial_pgrams, initial_order, temporary_pgrams, temporary_order, // start_in_sequence, first_pivot_index); if (last_pivot_index + 2 < end_in_sequence) - _sz_sequence_argsort_ice_recursively( // + _sz_sequence_argsort_skylake_recursively( // initial_pgrams, initial_order, temporary_pgrams, temporary_order, // last_pivot_index + 1, end_in_sequence); } -SZ_PUBLIC sz_bool_t sz_pgrams_sort_ice(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { +SZ_PUBLIC sz_status_t sz_pgrams_sort_skylake(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { // First, initialize the `order` with `std::iota`-like behavior. 
for (sz_size_t i = 0; i != count; ++i) order[i] = i; @@ -896,14 +995,14 @@ SZ_PUBLIC sz_bool_t sz_pgrams_sort_ice(sz_pgram_t *pgrams, sz_size_t count, sz_m sz_size_t memory_usage = sizeof(sz_pgram_t) * count + sizeof(sz_sorted_idx_t) * count; sz_pgram_t *temporary_pgrams = (sz_pgram_t *)alloc->allocate(memory_usage, alloc); sz_sorted_idx_t *temporary_order = (sz_sorted_idx_t *)(temporary_pgrams + count); - if (!temporary_pgrams) return sz_false_k; + if (!temporary_pgrams) return sz_bad_alloc_k; // Reuse the string sorting algorithm for sorting the "pgrams". - _sz_sequence_argsort_ice_recursively(pgrams, order, temporary_pgrams, temporary_order, 0, count); + _sz_sequence_argsort_skylake_recursively(pgrams, order, temporary_pgrams, temporary_order, 0, count); // Deallocate the temporary memory used for partitioning. alloc->free(temporary_pgrams, memory_usage, alloc); - return sz_true_k; + return sz_success_k; } /** @@ -911,7 +1010,7 @@ SZ_PUBLIC sz_bool_t sz_pgrams_sort_ice(sz_pgram_t *pgrams, sz_size_t count, sz_m * It combines `_sz_sequence_argsort_serial_export_next_pgrams` and `_sz_sequence_argsort_serial_recursively`, * recursively diving into the identical pgrams. */ -SZ_PUBLIC void _sz_sequence_argsort_ice_next_pgrams( // +SZ_PUBLIC void _sz_sequence_argsort_skylake_next_pgrams( // sz_sequence_t const *const sequence, // sz_pgram_t *const global_pgrams, sz_sorted_idx_t *const global_order, // sz_pgram_t *const temporary_pgrams, sz_sorted_idx_t *const temporary_order, // @@ -923,8 +1022,8 @@ SZ_PUBLIC void _sz_sequence_argsort_ice_next_pgrams( end_in_sequence, start_character); // Sort current pgrams with a quicksort - _sz_sequence_argsort_ice_recursively(global_pgrams, global_order, temporary_pgrams, temporary_order, - start_in_sequence, end_in_sequence); + _sz_sequence_argsort_skylake_recursively(global_pgrams, global_order, temporary_pgrams, temporary_order, + start_in_sequence, end_in_sequence); // Depending on the architecture, we will export a different number of bytes. // On 32-bit architectures, we will export 3 bytes, and on 64-bit architectures - 7 bytes. @@ -944,17 +1043,17 @@ SZ_PUBLIC void _sz_sequence_argsort_ice_next_pgrams( int has_multiple_strings = nested_end - nested_start > 1; int has_more_characters_in_each = current_pgram_length == pgram_capacity; if (has_multiple_strings && has_more_characters_in_each) { - _sz_sequence_argsort_ice_next_pgrams(sequence, global_pgrams, global_order, temporary_pgrams, - temporary_order, nested_start, nested_end, - start_character + pgram_capacity); + _sz_sequence_argsort_skylake_next_pgrams(sequence, global_pgrams, global_order, temporary_pgrams, + temporary_order, nested_start, nested_end, + start_character + pgram_capacity); } // Move to the next nested_start = nested_end; } } -SZ_PUBLIC sz_bool_t sz_sequence_argsort_ice(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { +SZ_PUBLIC sz_status_t sz_sequence_argsort_skylake(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { // First, initialize the `order` with `std::iota`-like behavior. sz_size_t count = sequence->count; @@ -964,7 +1063,7 @@ SZ_PUBLIC sz_bool_t sz_sequence_argsort_ice(sz_sequence_t const *sequence, sz_me // without any smart optimizations or memory allocations. if (count <= 32) { sz_sequence_argsort_with_insertion(sequence, order); - return sz_true_k; + return sz_success_k; } // Allocate memory for partitioning the elements around the pivot. 
@@ -972,20 +1071,20 @@ SZ_PUBLIC sz_bool_t sz_sequence_argsort_ice(sz_sequence_t const *sequence, sz_me sz_pgram_t *global_pgrams = (sz_pgram_t *)alloc->allocate(memory_usage, alloc); sz_pgram_t *temporary_pgrams = global_pgrams + count; sz_sorted_idx_t *temporary_order = (sz_sorted_idx_t *)(temporary_pgrams + count); - if (!global_pgrams) return sz_false_k; + if (!global_pgrams) return sz_bad_alloc_k; // Recursively sort the whole sequence. - _sz_sequence_argsort_ice_next_pgrams(sequence, global_pgrams, order, temporary_pgrams, temporary_order, // - 0, count, 0); + _sz_sequence_argsort_skylake_next_pgrams(sequence, global_pgrams, order, temporary_pgrams, temporary_order, // + 0, count, 0); // Free temporary storage. alloc->free(global_pgrams, memory_usage, alloc); - return sz_true_k; + return sz_success_k; } #pragma clang attribute pop #pragma GCC pop_options -#endif // SZ_USE_ICE +#endif // SZ_USE_SKYLAKE #pragma endregion // Ice Lake Implementation /* Pick the right implementation for the string search algorithms. @@ -994,10 +1093,10 @@ SZ_PUBLIC sz_bool_t sz_sequence_argsort_ice(sz_sequence_t const *sequence, sz_me #pragma region Compile Time Dispatching #if !SZ_DYNAMIC_DISPATCH -SZ_DYNAMIC sz_bool_t sz_sequence_argsort(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { -#if SZ_USE_ICE - return sz_sequence_argsort_ice(sequence, alloc, order); +SZ_DYNAMIC sz_status_t sz_sequence_argsort(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { +#if SZ_USE_SKYLAKE + return sz_sequence_argsort_skylake(sequence, alloc, order); #elif SZ_USE_SVE return sz_sequence_argsort_sve(sequence, alloc, order); #else @@ -1005,10 +1104,10 @@ SZ_DYNAMIC sz_bool_t sz_sequence_argsort(sz_sequence_t const *sequence, sz_memor #endif } -SZ_DYNAMIC sz_bool_t sz_pgrams_sort(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { -#if SZ_USE_ICE - return sz_pgrams_sort_ice(pgrams, count, alloc, order); +SZ_DYNAMIC sz_status_t sz_pgrams_sort(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { +#if SZ_USE_SKYLAKE + return sz_pgrams_sort_skylake(pgrams, count, alloc, order); #elif SZ_USE_SVE return sz_pgrams_sort_sve(pgrams, count, alloc, order); #else @@ -1016,10 +1115,10 @@ SZ_DYNAMIC sz_bool_t sz_pgrams_sort(sz_pgram_t *pgrams, sz_size_t count, sz_memo #endif } -SZ_DYNAMIC sz_bool_t sz_sequence_argsort_stable(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { -#if SZ_USE_ICE - return sz_sequence_argsort_ice(sequence, alloc, order); +SZ_DYNAMIC sz_status_t sz_sequence_argsort_stable(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { +#if SZ_USE_SKYLAKE + return sz_sequence_argsort_skylake(sequence, alloc, order); #elif SZ_USE_SVE return sz_sequence_argsort_sve(sequence, alloc, order); #else @@ -1027,10 +1126,10 @@ SZ_DYNAMIC sz_bool_t sz_sequence_argsort_stable(sz_sequence_t const *sequence, s #endif } -SZ_DYNAMIC sz_bool_t sz_pgrams_sort_stable(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { -#if SZ_USE_ICE - return sz_pgrams_sort_ice(pgrams, count, alloc, order); +SZ_DYNAMIC sz_status_t sz_pgrams_sort_stable(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { +#if SZ_USE_SKYLAKE + return sz_pgrams_sort_skylake(pgrams, count, alloc, order); #elif SZ_USE_SVE return sz_pgrams_sort_sve(pgrams, count, 
alloc, order); #else diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index 825e36f9..09b44eff 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -331,6 +331,25 @@ typedef sz_size_t sz_pgram_t; // "Pointer-sized N-gram" of a string typedef enum { sz_false_k = 0, sz_true_k = 1 } sz_bool_t; // Only one relevant bit typedef enum { sz_less_k = -1, sz_equal_k = 0, sz_greater_k = 1 } sz_ordering_t; // Only three possible states: <=> +/** + * @brief Describes an error status of a function. + */ +typedef enum { + /** + * For algorithms that return a status, this status indicates that the operation was successful. + */ + sz_success_k = 0, + /** + * For algorithms that require memory allocation, this status indicates that the allocation failed. + */ + sz_bad_alloc_k = -1, + /** + * For algorithms that have an upper bound on some parameter, like the maximum number of iterations, + * or the maximum edit distance, this status indicates that the limit was reached. + */ + sz_reached_limit_k = -2, +} sz_status_t; + /** * @brief Describes the length of a UTF8 @b rune / character / codepoint in bytes. */ From dc7c109ef088faa015bf2135211c9d7978d8ff54 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 21 Feb 2025 13:57:00 +0000 Subject: [PATCH 117/751] Add: Missing `sz_sequence_t` helpers --- include/stringzilla/types.h | 112 +++++++++++++++++++++--------------- scripts/test.cpp | 15 ++++- 2 files changed, 80 insertions(+), 47 deletions(-) diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index 09b44eff..75d76e61 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -461,68 +461,71 @@ SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void #pragma region API Signature Types -/** @brief Signature of ::sz_hash. */ +/** @brief Signature of `sz_hash`. */ typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t, sz_u64_t); -/** @brief Signature of ::sz_hash_state_init. */ +/** @brief Signature of `sz_hash_state_init`. */ typedef void (*sz_hash_state_init_t)(struct sz_hash_state_t *, sz_u64_t); -/** @brief Signature of ::sz_hash_state_stream. */ +/** @brief Signature of `sz_hash_state_stream`. */ typedef void (*sz_hash_state_stream_t)(struct sz_hash_state_t *, sz_cptr_t, sz_size_t); -/** @brief Signature of ::sz_hash_state_fold. */ +/** @brief Signature of `sz_hash_state_fold`. */ typedef sz_u64_t (*sz_hash_state_fold_t)(struct sz_hash_state_t const *); -/** @brief Signature of ::sz_bytesum. */ +/** @brief Signature of `sz_bytesum`. */ typedef sz_u64_t (*sz_bytesum_t)(sz_cptr_t, sz_size_t); -/** @brief Signature of ::sz_generate. */ +/** @brief Signature of `sz_generate`. */ typedef void (*sz_generate_t)(sz_ptr_t, sz_size_t, sz_u64_t); -/** @brief Signature of ::sz_equal. */ +/** @brief Signature of `sz_equal`. */ typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t); -/** @brief Signature of ::sz_order. */ +/** @brief Signature of `sz_order`. */ typedef sz_ordering_t (*sz_order_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -/** @brief Signature of ::sz_look_up_transform. */ -typedef void (*sz_look_up_transform_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t); +/** @brief Signature of `sz_lookup`. */ +typedef void (*sz_lookup_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t); -/** @brief Signature of ::sz_move. */ +/** @brief Signature of `sz_move`. */ typedef void (*sz_move_t)(sz_ptr_t, sz_cptr_t, sz_size_t); -/** @brief Signature of ::sz_fill. 
*/ +/** @brief Signature of `sz_fill`. */ typedef void (*sz_fill_t)(sz_ptr_t, sz_size_t, sz_u8_t); -/** @brief Signature of ::sz_find_byte. */ +/** @brief Signature of `sz_find_byte`. */ typedef sz_cptr_t (*sz_find_byte_t)(sz_cptr_t, sz_size_t, sz_cptr_t); -/** @brief Signature of ::sz_find. */ +/** @brief Signature of `sz_find`. */ typedef sz_cptr_t (*sz_find_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -/** @brief Signature of ::sz_find_set. */ +/** @brief Signature of `sz_find_set`. */ typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_charset_t const *); -/** @brief Signature of ::sz_hamming_distance. */ -typedef sz_size_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t); +/** @brief Signature of `sz_hamming_distance`. */ +typedef sz_status_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_size_t *); -/** @brief Signature of ::sz_edit_distance. */ -typedef sz_size_t (*sz_edit_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_memory_allocator_t *); +/** @brief Signature of `sz_levenshtein_distance`. */ +typedef sz_status_t (*sz_levenshtein_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, + sz_memory_allocator_t *, sz_size_t *); -/** @brief Signature of ::sz_alignment_score. */ -typedef sz_ssize_t (*sz_alignment_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *, - sz_error_cost_t, sz_memory_allocator_t *); +/** @brief Signature of `sz_needleman_wunsch_score`. */ +typedef sz_status_t (*sz_needleman_wunsch_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *, + sz_error_cost_t, sz_memory_allocator_t *, sz_ssize_t *); -/** @brief Signature of ::sz_sequence_argsort. */ -typedef sz_bool_t (*sz_sequence_argsort_t)(struct sz_sequence_t const *, sz_memory_allocator_t *, sz_sorted_idx_t *); +/** @brief Signature of `sz_sequence_argsort`. */ +typedef sz_status_t (*sz_sequence_argsort_t)(struct sz_sequence_t const *, sz_memory_allocator_t *, sz_sorted_idx_t *, + sz_bool_t *); -/** @brief Signature of ::sz_pgrams_sort. */ -typedef sz_bool_t (*sz_pgrams_sort_t)(sz_pgram_t *, sz_size_t, sz_memory_allocator_t *, sz_sorted_idx_t *); +/** @brief Signature of `sz_pgrams_sort`. */ +typedef sz_status_t (*sz_pgrams_sort_t)(sz_pgram_t *, sz_size_t, sz_memory_allocator_t *, sz_sorted_idx_t *, + sz_bool_t *); -/** @brief Signature of ::sz_sequence_argsort_stable. */ +/** @brief Signature of `sz_sequence_argsort_stable`. */ typedef sz_sequence_argsort_t sz_sequence_argsort_stable_t; -/** @brief Signature of ::sz_pgrams_sort_stable. */ +/** @brief Signature of `sz_pgrams_sort_stable`. */ typedef sz_pgrams_sort_t sz_pgrams_sort_stable_t; #pragma endregion @@ -683,9 +686,17 @@ SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_le #pragma region String Sequences API -typedef sz_cptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t); -typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t); +/** @brief Signature of `sz_sequence_t::get_start` used to get the start of a member string at a given index. */ +typedef sz_cptr_t (*sz_sequence_member_start_t)(void const *, sz_size_t); +/** @brief Signature of `sz_sequence_t::get_length` used to get the length of a member string at a given index. */ +typedef sz_size_t (*sz_sequence_member_length_t)(void const *, sz_size_t); +/** + * @brief Structure to represent an ordered collection of strings. 
+ * It's a generic structure that can be used to represent a sequence of strings in different layouts. + * It can be easily combined with Apache Arrow and its tape-like concatenated strings. + * @sa sz_sequence_from_null_terminated_strings + */ typedef struct sz_sequence_t { void const *handle; sz_size_t count; @@ -694,20 +705,12 @@ typedef struct sz_sequence_t { } sz_sequence_t; /** - * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. - * Expects ::offsets to contains `count + 1` entries, the last pointing at the end - * of the last string, indicating the total length of the ::tape. - */ -SZ_PUBLIC void sz_sequence_from_u32tape( // - sz_cptr_t *start, sz_u32_t const *offsets, sz_size_t count, sz_sequence_t *sequence); - -/** - * @brief Initiates the sequence structure from a tape layout, used by Apache Arrow. - * Expects ::offsets to contains `count + 1` entries, the last pointing at the end - * of the last string, indicating the total length of the ::tape. + * @brief Initiates the sequence structure from a typical C-style strings array, like `char *[]`. + * @param[in] start Pointer to the array of strings. + * @param[in] count Number of strings in the array. + * @param[out] sequence Sequence structure to initialize. */ -SZ_PUBLIC void sz_sequence_from_u64tape( // - sz_cptr_t *start, sz_u64_t const *offsets, sz_size_t count, sz_sequence_t *sequence); +SZ_PUBLIC void sz_sequence_from_null_terminated_strings(sz_cptr_t *start, sz_size_t count, sz_sequence_t *sequence); #pragma endregion @@ -857,7 +860,7 @@ SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return __builtin_bswap SZ_INTERNAL sz_u64_t sz_u64_rotl(sz_u64_t x, sz_u64_t r) { return (x << r) | (x >> (64 - r)); } /** - * @brief Select bits from either ::a or ::b depending on the value of ::mask bits. + * @brief Select bits from either @p a or @p b depending on the value of @p mask bits. * * Similar to `_mm_blend_epi16` intrinsic on x86. * Described in the "Bit Twiddling Hacks" by Sean Eron Anderson. @@ -987,7 +990,7 @@ SZ_INTERNAL sz_size_t sz_size_log2i_nonzero(sz_size_t x) { } /** - * @brief Compute the smallest power of two greater than or equal to ::x. + * @brief Compute the smallest power of two greater than or equal to @p x. */ SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) { // Unlike the commonly used trick with `clz` intrinsics, is valid across the whole range of `x`. 
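
A minimal usage sketch of the sequence view introduced above, assuming only the `sz_sequence_t` layout and the `sz_sequence_from_null_terminated_strings` declaration shown in this hunk; the function and variable names below are illustrative, not part of the library:

    #include <stringzilla/types.h> /* assumed include path for the declarations above */

    static void example_wrap_c_strings(void) {
        /* Three NULL-terminated strings owned by the caller. */
        sz_cptr_t strings[3] = {"banana", "apple", "cherry"};

        /* Build a non-owning view; nothing is allocated or copied. */
        sz_sequence_t sequence;
        sz_sequence_from_null_terminated_strings(strings, 3, &sequence);

        /* Members are reached through the `get_start` / `get_length` callbacks. */
        sz_cptr_t first = sequence.get_start(sequence.handle, 0);         /* -> "banana" */
        sz_size_t first_length = sequence.get_length(sequence.handle, 0); /* -> 6 */
        (void)first;
        (void)first_length;
    }
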
@@ -1149,6 +1152,25 @@ SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void
     *(sz_ptr_t)buffer = *(sz_cptr_t)&length;
 }
 
+SZ_PUBLIC sz_cptr_t _sz_sequence_from_null_terminated_strings_get_start(void const *handle, sz_size_t i) {
+    sz_cptr_t const *start = (sz_cptr_t const *)handle;
+    return start[i];
+}
+
+SZ_PUBLIC sz_size_t _sz_sequence_from_null_terminated_strings_get_length(void const *handle, sz_size_t i) {
+    sz_cptr_t const *start = (sz_cptr_t const *)handle;
+    sz_size_t length = 0;
+    for (sz_cptr_t ptr = start[i]; *ptr; ptr++) length++;
+    return length;
+}
+
+SZ_PUBLIC void sz_sequence_from_null_terminated_strings(sz_cptr_t *start, sz_size_t count, sz_sequence_t *sequence) {
+    sequence->handle = start;
+    sequence->count = count;
+    sequence->get_start = _sz_sequence_from_null_terminated_strings_get_start;
+    sequence->get_length = _sz_sequence_from_null_terminated_strings_get_length;
+}
+
 #pragma endregion
 
 #ifdef __cplusplus
diff --git a/scripts/test.cpp b/scripts/test.cpp
index 74282523..e3a62f3d 100644
--- a/scripts/test.cpp
+++ b/scripts/test.cpp
@@ -1576,8 +1576,8 @@ void test_replacements(std::size_t lookup_tables_to_try = 128, std::size_t slice
         std::size_t slice_offset = std::rand() % (body.length());
         std::size_t slice_length = std::rand() % (body.length() - slice_offset);
 
-        sz::transform(sz::string_view(body.data() + slice_offset, slice_length), lut,
-                      const_cast(transformed.data()) + slice_offset);
+        sz::lookup(sz::string_view(body.data() + slice_offset, slice_length), lut,
+                   const_cast(transformed.data()) + slice_offset);
         for (std::size_t i = 0; i != slice_length; ++i) {
             assert(transformed[slice_offset + i] == lut[body[slice_offset + i]]);
         }
@@ -1592,6 +1592,17 @@ static void test_sequence_algorithms() {
     using strs_t = std::vector;
     using order_t = std::vector;
 
+    // Make sure the helper functions work as expected.
+    {
+        sz_sequence_t sequence;
+        sz_cptr_t strings[] = {"banana", "apple", "cherry"};
+        sz_sequence_from_null_terminated_strings(strings, 3, &sequence);
+        assert(sequence.count == 3);
+        assert(sequence.get_start(sequence.handle, 0) == "banana"_sv);
+        assert(sequence.get_start(sequence.handle, 1) == "apple"_sv);
+        assert(sequence.get_start(sequence.handle, 2) == "cherry"_sv);
+    }
+
     // Basic tests with predetermined orders.
     assert_scoped(strs_t x({"a", "b", "c", "d"}), (void)0, sz::argsort(x) == order_t({0u, 1u, 2u, 3u}));
     assert_scoped(strs_t x({"b", "c", "d", "a"}), (void)0, sz::argsort(x) == order_t({3u, 0u, 1u, 2u}));

From 5a12c00d1dd6794da06f5267f01519394a1b49cf Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Fri, 21 Feb 2025 14:09:31 +0000
Subject: [PATCH 118/751] Improve: Use default allocator, when not provided

---
 include/stringzilla/sort.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h
index a394b646..c422e3c3 100644
--- a/include/stringzilla/sort.h
+++ b/include/stringzilla/sort.h
@@ -665,6 +665,13 @@ SZ_PUBLIC sz_status_t sz_sequence_argsort_serial(sz_sequence_t const *sequence,
         return sz_success_k;
     }
 
+    // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
+    sz_memory_allocator_t global_alloc;
+    if (!alloc) {
+        sz_memory_allocator_init_default(&global_alloc);
+        alloc = &global_alloc;
+    }
+
     // One of the reasons for slow string operations is the significant overhead of branching when performing
    // individual string comparisons.
// @@ -773,6 +780,13 @@ SZ_PUBLIC sz_status_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t sz_size_t const tail_count = count & 7u; sz_pgrams_sort_stable_with_insertion(pgrams + count - tail_count, tail_count, order + count - tail_count); + // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. + sz_memory_allocator_t global_alloc; + if (!alloc) { + sz_memory_allocator_init_default(&global_alloc); + alloc = &global_alloc; + } + // At this point, the array is partitioned into sorted runs. // We'll now merge these runs until the whole array is sorted. // Allocate temporary memory to hold merged results: @@ -991,6 +1005,13 @@ SZ_PUBLIC sz_status_t sz_pgrams_sort_skylake(sz_pgram_t *pgrams, sz_size_t count // First, initialize the `order` with `std::iota`-like behavior. for (sz_size_t i = 0; i != count; ++i) order[i] = i; + // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. + sz_memory_allocator_t global_alloc; + if (!alloc) { + sz_memory_allocator_init_default(&global_alloc); + alloc = &global_alloc; + } + // Allocate memory for partitioning the elements around the pivot. sz_size_t memory_usage = sizeof(sz_pgram_t) * count + sizeof(sz_sorted_idx_t) * count; sz_pgram_t *temporary_pgrams = (sz_pgram_t *)alloc->allocate(memory_usage, alloc); @@ -1066,6 +1087,13 @@ SZ_PUBLIC sz_status_t sz_sequence_argsort_skylake(sz_sequence_t const *sequence, return sz_success_k; } + // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. + sz_memory_allocator_t global_alloc; + if (!alloc) { + sz_memory_allocator_init_default(&global_alloc); + alloc = &global_alloc; + } + // Allocate memory for partitioning the elements around the pivot. 
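
With the fallback above in place, higher-level callers can simply pass `NULL` for the allocator and branch on the returned `sz_status_t`. A rough sketch, assuming the dispatching `sz_pgrams_sort` seen in the earlier hunks and that the selected backend follows the same `NULL`-allocator convention; `example_sort_pgrams` is an illustrative caller, not a library function:

    #include <stringzilla/sort.h> /* assumed include path for the sorting API */

    static int example_sort_pgrams(sz_pgram_t *pgrams, sz_sorted_idx_t *order, sz_size_t count) {
        /* `NULL` now means "fall back to the default malloc-based allocator". */
        sz_status_t status = sz_pgrams_sort(pgrams, count, NULL, order);
        if (status == sz_bad_alloc_k) return -1; /* temporary buffers could not be allocated */
        return status == sz_success_k ? 0 : -1;
    }
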
sz_size_t memory_usage = sizeof(sz_pgram_t) * count * 2 + sizeof(sz_sorted_idx_t) * count; sz_pgram_t *global_pgrams = (sz_pgram_t *)alloc->allocate(memory_usage, alloc); From 7698392efcfab4bb1dee856ceb88f9e618449638 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 22 Feb 2025 12:27:24 +0000 Subject: [PATCH 119/751] Improve: Clean `memory.h` header --- include/stringzilla/memory.h | 138 +++++++++++++++-------------------- 1 file changed, 59 insertions(+), 79 deletions(-) diff --git a/include/stringzilla/memory.h b/include/stringzilla/memory.h index de739f22..cc5cc6d7 100644 --- a/include/stringzilla/memory.h +++ b/include/stringzilla/memory.h @@ -5,15 +5,27 @@ * * Includes core APIs for contiguous memory operations: * - * - `sz_copy` - analog to `memcpy` - * - `sz_move` - analog to `memmove` - * - `sz_fill` - analog to `memset` - * - `sz_lookup` - LUT transformation of a string, similar to OpenCV LUT - * - TODO: `sz_detect_encoding` - similar to `iconv` or `chardet` + * - @b `sz_copy` - analog to `memcpy`, probably the most common operation in a computer + * - @b `sz_move` - analog to `memmove`, allowing overlapping memory regions, often used in string manipulation + * - @b `sz_fill` - analog to `memset`, often used to initialize memory with a constant value, like zero + * - @b `sz_lookup` - Look-Up Table @b (LUT) transformation of a string, mapping each byte to a new value + * - TODO: @b `sz_lookup_utf8` - LUT transformation of a UTF8 string, which can be used for normalization + * - TODO: @b `sz_detect_encoding` - detects the character encoding similar to "iconv" or "chardet" tools * - * Convenience functions for character-set mapping: + * All of the core APIs receive the target output buffer as the first argument, + * and aim to minimize the number of "store" instructions, especially unaligned ones, + * that can invalidate 2 cache lines. * - * - `sz_tolower`, `sz_toupper`, `sz_toascii` for ASCII ranges + * Unlike many other libraries focusing on trivial SIMD transformations, like converting + * lowercase to uppercase, StringZilla generalizes those to basic lookup table transforms. + * For typical ASCII conversions, you can use the following @b LUT initialization functions: + * + * - `sz_lookup_init_lower` for transforms like `tolower` + * - `sz_lookup_init_upper` for transforms like `toupper` + * - `sz_lookup_init_ascii` for transforms like `isascii` + * + * The header also exposes a minimalistic @b `sz_isascii` which can be used in UTF-8 capable + * methods to select a simpler execution path for ASCII characters. */ #ifndef STRINGZILLA_MEMORY_H_ #define STRINGZILLA_MEMORY_H_ @@ -28,6 +40,7 @@ extern "C" { /** * @brief Similar to `memcpy`, copies contents of one string into another. + * @see https://en.cppreference.com/w/c/string/byte/memcpy * * @param[out] target String to copy into. Can be `NULL`, if the @p length is zero. * @param[in] length Number of bytes to copy. Can be a zero. @@ -55,6 +68,7 @@ SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length); /** * @brief Similar to `memmove`, copies (moves) contents of one string into another. * Unlike `sz_copy`, allows overlapping strings as arguments. + * @see https://en.cppreference.com/w/c/string/byte/memmove * * @param[out] target String to copy into. Can be `NULL`, if the @p length is zero. * @param[in] length Number of bytes to copy. Can be a zero. 
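
As a rough illustration of the LUT-centric design this header describes, assuming only that a table is a plain 256-byte byte-to-byte mapping as the initializers further below define it, a caller could uppercase a buffer in place. The dispatching `sz_lookup` performs the same transform with SIMD, so the explicit loop here only spells out the semantics; `example_uppercase_in_place` is an illustrative caller, not a library function:

    static void example_uppercase_in_place(char *text, sz_size_t length) {
        char lut[256];
        sz_lookup_init_upper(lut); /* fills the 256-entry mapping, defined later in this header */
        for (sz_size_t i = 0; i != length; ++i) text[i] = lut[(unsigned char)text[i]];
    }
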
@@ -78,6 +92,7 @@ SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length); /** * @brief Similar to `memset`, fills a string with a given value. + * @see https://en.cppreference.com/w/c/string/byte/memset * * @param[out] target String to fill. Can be `NULL`, if the @p length is zero. * @param[in] length Number of bytes to fill. Can be a zero. @@ -184,52 +199,17 @@ SZ_PUBLIC void sz_lookup_neon(sz_ptr_t target, sz_size_t length, sz_cptr_t sourc #pragma region Helper API /** - * @brief Equivalent to `for (char & c : text) c = tolower(c)`. - * - * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. - * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. - * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html - * - * @param text String to be normalized. - * @param[in] length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_tolower(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = toupper(c)`. + * @brief Initializes a lookup table for converting ASCII characters to lowercase. * * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. * So there are 26 english letters, shifted by 32 values, meaning that a conversion - * can be done by flipping the 5th bit each inappropriate character byte. This, however, - * breaks for extended ASCII, so a different solution is needed. + * can be done by flipping the 5th bit each inappropriate character byte. + * This, however, breaks for extended ASCII, so a different solution is needed. * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html * - * @param text String to be normalized. - * @param[in] length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. - */ -SZ_PUBLIC void sz_toupper(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -/** - * @brief Equivalent to `for (char & c : text) c = toascii(c)`. - * - * @param text String to be normalized. - * @param[in] length Number of bytes in the string. - * @param result Output string, can point to the same address as ::text. + * @param[out] lut Lookup table to be initialized. Must be exactly 256 bytes long. */ -SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result); - -#pragma endregion // Helper API - -#pragma region Serial Implementation - -/** - * @brief Uses a small lookup-table to convert a lowercase character to uppercase. - */ -SZ_INTERNAL sz_u8_t sz_u8_tolower(sz_u8_t c) { +SZ_PUBLIC void sz_lookup_init_lower(sz_ptr_t lut) { static sz_u8_t const lowered[256] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // @@ -248,13 +228,21 @@ SZ_INTERNAL sz_u8_t sz_u8_tolower(sz_u8_t c) { 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // }; - return lowered[c]; + for (sz_size_t i = 0; i < 256; ++i) lut[i] = lowered[i]; } /** - * @brief Uses a small lookup-table to convert an uppercase character to lowercase. + * @brief Initializes a lookup table for converting ASCII characters to uppercase. 
+ * + * ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122]. + * So there are 26 english letters, shifted by 32 values, meaning that a conversion + * can be done by flipping the 5th bit each inappropriate character byte. + * This, however, breaks for extended ASCII, so a different solution is needed. + * http://0x80.pl/notesen/2016-01-06-swar-swap-case.html + * + * @param[out] lut Lookup table to be initialized. Must be exactly 256 bytes long. */ -SZ_INTERNAL sz_u8_t sz_u8_toupper(sz_u8_t c) { +SZ_PUBLIC void sz_lookup_init_upper(sz_ptr_t lut) { static sz_u8_t const upped[256] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // @@ -273,43 +261,23 @@ SZ_INTERNAL sz_u8_t sz_u8_toupper(sz_u8_t c) { 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // }; - return upped[c]; -} - -SZ_PUBLIC void sz_lookup_serial(sz_ptr_t result, sz_size_t length, sz_cptr_t text, sz_cptr_t lut) { - sz_u8_t const *unsigned_lut = (sz_u8_t const *)lut; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = unsigned_lut[*unsigned_text]; -} - -SZ_PUBLIC void sz_tolower_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_tolower(*unsigned_text); + for (sz_size_t i = 0; i < 256; ++i) lut[i] = upped[i]; } -SZ_PUBLIC void sz_toupper_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_toupper(*unsigned_text); -} - -SZ_PUBLIC void sz_toascii_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) { - sz_u8_t *unsigned_result = (sz_u8_t *)result; - sz_u8_t const *unsigned_text = (sz_u8_t const *)text; - sz_u8_t const *end = unsigned_text + length; - for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = *unsigned_text & 0x7F; +/** + * @brief Initializes a lookup table for converting bytes to ASCII characters. + * + * @param[out] lut Lookup table to be initialized. Must be exactly 256 bytes long. + */ +SZ_PUBLIC void sz_lookup_init_ascii(sz_ptr_t lut) { + for (sz_size_t i = 0; i < 256; ++i) lut[i] = (sz_u8_t)(i & 0x7F); } /** * @brief Check if there is a byte in this buffer, that exceeds 127 and can't be an ASCII character. * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. 
*/ -SZ_PUBLIC sz_bool_t sz_isascii_serial(sz_cptr_t text, sz_size_t length) { +SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t text, sz_size_t length) { if (!length) return sz_true_k; sz_u8_t const *h = (sz_u8_t const *)text; @@ -334,6 +302,18 @@ SZ_PUBLIC sz_bool_t sz_isascii_serial(sz_cptr_t text, sz_size_t length) { return sz_true_k; } +#pragma endregion // Helper API + +#pragma region Serial Implementation + +SZ_PUBLIC void sz_lookup_serial(sz_ptr_t result, sz_size_t length, sz_cptr_t text, sz_cptr_t lut) { + sz_u8_t const *unsigned_lut = (sz_u8_t const *)lut; + sz_u8_t const *unsigned_text = (sz_u8_t const *)text; + sz_u8_t *unsigned_result = (sz_u8_t *)result; + sz_u8_t const *end = unsigned_text + length; + for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = unsigned_lut[*unsigned_text]; +} + // When overriding libc, disable optimizations for this function because MSVC will optimize the loops into a `memset`. // Which then causes a stack overflow due to infinite recursion (`memset` -> `sz_fill_serial` -> `memset`). #if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC From 095bc2da38574f8a92ec053efa4c7261eaf1a730 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 22 Feb 2025 12:28:20 +0000 Subject: [PATCH 120/751] Break: New calling convention in `similarity.h` --- include/stringzilla/similarity.h | 588 +++++++++++++++++----------- include/stringzilla/stringzilla.hpp | 54 +-- 2 files changed, 394 insertions(+), 248 deletions(-) diff --git a/include/stringzilla/similarity.h b/include/stringzilla/similarity.h index 188169ff..60540b33 100644 --- a/include/stringzilla/similarity.h +++ b/include/stringzilla/similarity.h @@ -5,9 +5,9 @@ * * Includes core APIs: * - * - `sz_edit_distance` & `sz_edit_distance_utf8` for Levenshtein edit-distance computation. - * - `sz_alignment_score` for weighted Needleman-Wunsch global alignment. * - `sz_hamming_distance` & `sz_hamming_distance_utf8` for Hamming distance computation. + * - `sz_levenshtein_distance` & `sz_levenshtein_distance_utf8` for Levenshtein edit-distance computation. + * - `sz_needleman_wunsch_score` for weighted Needleman-Wunsch global alignment. * * The Hamming distance is rarely used in string processing, so only minimal compatibility is provided. * The Levenshtein distance, however, is much more popular and computationally intensive. @@ -26,130 +26,220 @@ extern "C" { #pragma region Core API /** - * @brief Computes the Hamming distance between two strings - number of not matching characters. - * Difference in length is is counted as a mismatch. + * @brief Computes the Hamming distance between two strings. + * @see https://en.wikipedia.org/wiki/Hamming_distance * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. + * The Hamming distance is defined as the number of positions at which the corresponding bytes differ. + * If the strings have different lengths, the extra characters in the longer string are treated as mismatches. * - * @param bound Exclusive upper bound on the distance, that allows us to exit early. - * Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore. - * Pass zero to check if the strings are equal. - * @return Returns an unsigned integer for the edit distance. Zero means the strings are equal. 
- * Returns the `(max(a_length, b_length)) + 1` if the distance limit was reached. + * If the running distance reaches the @p bound, the computation aborts early. If the @p bound is zero, + * the function merely checks for equality. If the @p bound is larger than the maximum length of the strings, + * the function will compute the full "unbounded" distance. * - * @see sz_hamming_distance_utf8 - * @see https://en.wikipedia.org/wiki/Hamming_distance + * @param[in] a Pointer to the first string. + * @param[in] a_length Number of bytes in the first string. + * @param[in] b Pointer to the second string. + * @param[in] b_length Number of bytes in the second string. + * @param[in] bound Exclusive upper bound on the computed distance. + * @param[out] result On success, the computed byte-level Hamming distance is stored here. + * @retval `sz_success_k` if the operation was successful. + * @retval `sz_bad_alloc_k` if the operation failed due to memory allocation failure. + * + * Example usage: + * + * @code{.c} + * #include + * int main(void) { + * char const *s1 = "1011101"; + * char const *s2 = "1001001"; + * sz_size_t result, length = 7, bound = 10; + * sz_status_t status = sz_hamming_distance(s1, length, s2, length, bound, &result); + * return (status == sz_success_k && result == 2) ? 0 : 1; + * } + * @endcode + * + * @note This function isn't intended for UTF-8 texts and is not heavily optimized. + * @sa sz_hamming_distance_utf8 */ -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); +SZ_DYNAMIC sz_status_t sz_hamming_distance( // + sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound, sz_size_t *result); /** - * @brief Computes the Hamming distance between two @b UTF8 strings - number of not matching characters. - * Difference in length is is counted as a mismatch. + * @brief Computes the Hamming distance between two @b UTF-8 encoded strings. + * @see https://en.wikipedia.org/wiki/Hamming_distance * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. + * The Hamming distance is defined as the number of positions at which the corresponding Unicode runes differ. + * If the strings have different lengths, the extra characters in the longer string are treated as mismatches. * - * @param bound Exclusive upper bound on the distance, that allows us to exit early. - * Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore. - * Pass zero to check if the strings are equal. - * @return Returns an unsigned integer for the edit distance. Zero means the strings are equal. - * Returns the `(max(a_length, b_length)) + 1` if the distance limit was reached. + * If the running distance reaches the @p bound, the computation aborts early. If the @p bound is zero, + * the function merely checks for equality. If the @p bound is larger than the maximum length of the strings, + * the function will compute the full "unbounded" distance. * - * @see sz_hamming_distance - * @see https://en.wikipedia.org/wiki/Hamming_distance + * @param[in] a Pointer to the first string. + * @param[in] a_length Number of bytes in the first string. + * @param[in] b Pointer to the second string. + * @param[in] b_length Number of bytes in the second string. + * @param[in] bound Exclusive upper bound on the computed distance. 
+ * @param[out] result On success, the computed Unicode character-level Hamming distance is stored here. + * @retval `sz_success_k` if the operation was successful. + * @retval `sz_bad_alloc_k` if the operation failed due to memory allocation failure. + * @retval `sz_invalid_utf8_k` if the input strings are not valid UTF-8. + * + * Example usage: + * + * @code{.c} + * #include + * int main(void) { + * char const *s1 = "café"; + * char const *s2 = "cafe"; + * sz_size_t result, length1 = 5, length2 = 4, bound = 10; + * sz_status_t status = sz_hamming_distance_utf8(s1, length1, s2, length2, bound, &result); + * return (status == sz_success_k && result == 1) ? 0 : 1; + * } + * @endcode + * + * @note This function isn't heavily optimized. + * @sa sz_hamming_distance */ -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); +SZ_DYNAMIC sz_status_t sz_hamming_distance_utf8( // + sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound, sz_size_t *result); /** * @brief Computes the Levenshtein edit-distance between two strings using the Wagner-Fisher algorithm. * Similar to the Needleman-Wunsch alignment algorithm. Often used in fuzzy string matching. * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. + * If the running distance reaches the @p bound, the computation aborts early. If the @p bound is zero, + * the function merely checks for equality. If the @p bound is larger than the maximum length of the strings, + * the function will compute the full "unbounded" distance. + * + * @param[in] a Pointer to the first string. + * @param[in] a_length Number of bytes in the first string. + * @param[in] b Pointer to the second string. + * @param[in] b_length Number of bytes in the second string. + * @param[in] bound Exclusive upper bound on the computed distance. + * @param[in] alloc Optional memory allocator. If `NULL` is passed, will use to the systems default `malloc`. + * + * @param[out] result On success, the computed byte-level Levenshtein distance is stored here. + * @retval `sz_success_k` if the operation was successful. + * @retval `sz_bad_alloc_k` if the operation failed due to memory allocation failure. + * @retval `sz_invalid_utf8_k` if the input strings are not valid UTF-8. * - * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated, - * so the memory usage is linear in relation to ::a_length and ::b_length. - * If SZ_NULL is passed, will initialize to the systems default `malloc`. + * Example usage: * - * @param bound Exclusive upper bound on the distance, that allows us to exit early. - * Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore. - * Pass zero to check if the strings are equal. - * @return Returns an unsigned integer for the edit distance. Zero means the strings are equal. - * Returns the `(max(a_length, b_length)) + 1` if the distance limit was reached. - * Returns `SZ_SIZE_MAX` if the memory allocation failed. + * @code{.c} + * #include + * int main(void) { + * char const *s1 = "kitten"; + * char const *s2 = "sitting"; + * sz_size_t result, length1 = 6, length2 = 7, bound = 10; + * sz_status_t status = sz_levenshtein_distance(s1, length1, s2, length2, bound, NULL, &result); + * return (status == sz_success_k && result == 3) ? 
0 : 1;
+ * }
+ * @endcode
 *
- * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default
+ * @note The algorithm has linear memory complexity and @p a_length * @p b_length time complexity.
 * @see https://en.wikipedia.org/wiki/Levenshtein_distance
+ *
+ * @note This function isn't intended for UTF-8 texts.
+ * @sa sz_levenshtein_distance_utf8
+ *
+ * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`.
+ * @sa sz_levenshtein_distance_serial, sz_levenshtein_distance_ice
 */
-SZ_DYNAMIC sz_size_t sz_edit_distance( //
+SZ_DYNAMIC sz_status_t sz_levenshtein_distance( //
     sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
-    sz_size_t bound, sz_memory_allocator_t *alloc);
+    sz_size_t bound, sz_memory_allocator_t *alloc, sz_size_t *result);
 
 /**
- * @brief Computes the Levenshtein edit-distance between two @b UTF8 strings.
- *        Unlike `sz_edit_distance`, reports the distance in Unicode codepoints, and not in bytes.
+ * @brief  Computes the Levenshtein edit-distance between two @b UTF-8 strings using the Wagner-Fisher algorithm.
+ *         Similar to the Needleman-Wunsch alignment algorithm. Often used in fuzzy string matching.
+ *
+ * If the running distance reaches the @p bound, the computation aborts early. If the @p bound is zero,
+ * the function merely checks for equality. If the @p bound is larger than the maximum length of the strings,
+ * the function will compute the full "unbounded" distance.
 *
- * @param a First string to compare.
- * @param a_length Number of bytes in the first string.
- * @param b Second string to compare.
- * @param b_length Number of bytes in the second string.
+ * @param[in] a         Pointer to the first string.
+ * @param[in] a_length  Number of bytes in the first string.
+ * @param[in] b         Pointer to the second string.
+ * @param[in] b_length  Number of bytes in the second string.
+ * @param[in] bound     Exclusive upper bound on the computed distance.
+ * @param[in] alloc     Optional memory allocator. If `NULL` is passed, will use the system's default `malloc`.
 *
- * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated,
- *              so the memory usage is linear in relation to ::a_length and ::b_length.
- *              If SZ_NULL is passed, will initialize to the systems default `malloc`.
+ * @param[out] result   On success, the computed Levenshtein distance in Unicode code points is stored here.
+ * @retval `sz_success_k` if the operation was successful.
+ * @retval `sz_bad_alloc_k` if the operation failed due to memory allocation failure.
+ * @retval `sz_invalid_utf8_k` if the input strings are not valid UTF-8.
 *
- * @param bound Exclusive upper bound on the distance, that allows us to exit early.
- *              Pass `SZ_SIZE_MAX` or any value greater than `(max(a_length, b_length))` to ignore.
- *              Pass zero to check if the strings are equal.
- * @return Returns an unsigned integer for the edit distance. Zero means the strings are equal.
- *         Returns the `(max(a_length, b_length)) + 1` if the distance limit was reached.
- *         Returns `SZ_SIZE_MAX` if the memory allocation failed.
+ * Example usage:
 *
- * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default, sz_edit_distance
+ * @code{.c}
+ * #include 
+ * int main(void) {
+ *     char const *s1 = "café";
+ *     char const *s2 = "cafe";
+ *     sz_size_t result, length1 = 5, length2 = 4, bound = 10;
+ *     sz_status_t status = sz_levenshtein_distance_utf8(s1, length1, s2, length2, bound, NULL, &result);
+ *     return (status == sz_success_k && result == 1) ? 
0 : 1;
+ * }
+ * @endcode
+ *
+ * @note The algorithm has linear memory complexity and @p a_length * @p b_length time complexity.
 * @see https://en.wikipedia.org/wiki/Levenshtein_distance
+ *
+ * @sa sz_levenshtein_distance
+ *
+ * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`.
+ * @sa sz_levenshtein_distance_utf8_serial, sz_levenshtein_distance_utf8_ice
 */
-SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( //
+SZ_DYNAMIC sz_status_t sz_levenshtein_distance_utf8( //
     sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
-    sz_size_t bound, sz_memory_allocator_t *alloc);
+    sz_size_t bound, sz_memory_allocator_t *alloc, sz_size_t *result);
 
 /**
- * @brief Computes Needleman–Wunsch alignment score for two string. Often used in bioinformatics and cheminformatics.
- *        Similar to the Levenshtein edit-distance, parameterized for gap and substitution penalties.
+ * @brief  Computes the Needleman–Wunsch alignment score for two strings.
+ *         Often used in bioinformatics for sequence alignment.
 *
- * Not commutative in the general case, as the order of the strings matters, as `sz_alignment_score(a, b)` may
- * not be equal to `sz_alignment_score(b, a)`. Becomes @b commutative, if the substitution costs are symmetric.
- * Equivalent to the negative Levenshtein distance, if: `gap == -1` and `subs[i][j] == (i == j ? 0: -1)`.
+ * This function calculates a similarity score by applying gap and substitution penalties,
+ * following the Needleman–Wunsch algorithm. Note that the result is generally @b not-commutative —
+ * that is, `sz_needleman_wunsch_score(a, b)` may differ from `sz_needleman_wunsch_score(b, a)`
+ * unless the @p subs matrix is symmetric. With a @p gap penalty of -1 and substitution costs defined
+ * as 0 for matches and -1 for mismatches, the score is equivalent to the negative Levenshtein distance.
 *
- * @param a First string to compare.
- * @param a_length Number of bytes in the first string.
- * @param b Second string to compare.
- * @param b_length Number of bytes in the second string.
- * @param gap Penalty cost for gaps - insertions and removals.
- * @param subs Substitution costs matrix with 256 x 256 values for all pairs of characters.
+ * @param[in] a         Pointer to the first string.
+ * @param[in] a_length  Number of bytes in the first string.
+ * @param[in] b         Pointer to the second string.
+ * @param[in] b_length  Number of bytes in the second string.
+ * @param[in] subs      Substitution cost matrix (256×256) for all pairs of characters.
+ * @param[in] gap       Penalty cost for gaps (insertions and deletions).
+ * @param[in] alloc     Optional memory allocator. If `NULL` is passed, the system default `malloc` is used.
+ * @param[out] result   On success, the computed alignment score (possibly negative) is stored here.
+ * @retval `sz_success_k` if the operation was successful.
+ * @retval `sz_bad_alloc_k` if the operation failed due to memory allocation failure.
 *
- * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated,
- *              so the memory usage is linear in relation to ::a_length and ::b_length.
- *              If SZ_NULL is passed, will initialize to the systems default `malloc`.
+ * Example usage:
 *
- * @return Signed similarity score. Can be negative, depending on the substitution costs.
- *         Returns `SZ_SSIZE_MAX` if the memory allocation failed.
+ * @code{.c}
+ * #include 
+ * int main(void) {
+ *     char const *s1 = "GATTACA";
+ *     char const *s2 = "GCATGCU";
+ *     sz_error_cost_t subs[256 * 256] = { ... 
}; + * sz_error_cost_t gap = -1; + * sz_ssize_t score; + * sz_status_t status = sz_needleman_wunsch_score(s1, 7, s2, 7, subs, gap, NULL, &score); + * return (status == sz_success_k) ? 0 : 1; + * } + * @endcode * - * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default + * @note Algorithm has @p a_length * @p b_length worst-case time complexity and linear memory complexity. * @see https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm + * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. + * @sa sz_needleman_wunsch_score_serial, sz_needleman_wunsch_score_ice */ -SZ_DYNAMIC sz_ssize_t sz_alignment_score( // +SZ_DYNAMIC sz_status_t sz_needleman_wunsch_score( // sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); + sz_memory_allocator_t *alloc, sz_ssize_t *result); /** * @brief Checks if all characters in the range are valid ASCII characters. @@ -161,40 +251,44 @@ SZ_DYNAMIC sz_ssize_t sz_alignment_score( // SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t text, sz_size_t length); /** @copydoc sz_hamming_distance */ -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); +SZ_PUBLIC sz_status_t sz_hamming_distance_serial( // + sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound, sz_size_t *result); /** @copydoc sz_hamming_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( // - sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, sz_size_t bound); +SZ_PUBLIC sz_status_t sz_hamming_distance_utf8_serial( // + sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound, sz_size_t *result); -/** @copydoc sz_edit_distance */ -SZ_PUBLIC sz_size_t sz_edit_distance_serial( // +/** @copydoc sz_levenshtein_distance */ +SZ_PUBLIC sz_status_t sz_levenshtein_distance_serial( // sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); + sz_size_t bound, sz_memory_allocator_t *alloc, sz_size_t *result); -/** @copydoc sz_edit_distance_utf8 */ -SZ_PUBLIC sz_size_t sz_edit_distance_utf8_serial( // +/** @copydoc sz_levenshtein_distance_utf8 */ +SZ_PUBLIC sz_status_t sz_levenshtein_distance_utf8_serial( // sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); + sz_size_t bound, sz_memory_allocator_t *alloc, sz_size_t *result); -/** @copydoc sz_alignment_score */ -SZ_PUBLIC sz_ssize_t sz_alignment_score_serial( // +/** @copydoc sz_needleman_wunsch_score */ +SZ_PUBLIC sz_status_t sz_needleman_wunsch_score_serial( // sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, // sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc); + sz_memory_allocator_t *alloc, sz_ssize_t *result); #if SZ_USE_ICE -SZ_INTERNAL sz_size_t sz_edit_distance_ice( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc); +/** @copydoc sz_levenshtein_distance */ +SZ_PUBLIC sz_status_t sz_levenshtein_distance_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // + sz_size_t bound, sz_memory_allocator_t *alloc, sz_size_t *result); -SZ_INTERNAL sz_ssize_t sz_alignment_score_ice( // - sz_cptr_t 
shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc); +/** @copydoc sz_needleman_wunsch_score */ +SZ_PUBLIC sz_status_t sz_needleman_wunsch_score_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // + sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc, sz_ssize_t *result); #endif @@ -202,10 +296,10 @@ SZ_INTERNAL sz_ssize_t sz_alignment_score_ice( // #pragma region Serial Implementation -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { +SZ_INTERNAL sz_status_t _sz_levenshtein_distance_skewed_diagonals_serial( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // + sz_size_t bound, sz_memory_allocator_t *alloc, sz_size_t *result_ptr) { // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. sz_memory_allocator_t global_alloc; @@ -224,7 +318,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // sz_size_t n = shorter_length + 1; sz_size_t buffer_length = sizeof(sz_size_t) * n * 3; sz_size_t *distances = (sz_size_t *)alloc->allocate(buffer_length, alloc->handle); - if (!distances) return SZ_SIZE_MAX; + if (!distances) return sz_bad_alloc_k; sz_size_t *previous_distances = distances; sz_size_t *current_distances = previous_distances + n; @@ -276,7 +370,8 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // // Cache scalar before `free` call. sz_size_t result = current_distances[0]; alloc->free(distances, buffer_length, alloc->handle); - return result; + *result_ptr = result; + return sz_success_k; } /** @@ -290,10 +385,10 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // * + 100 codepoints * 2 strings * 4 bytes/codepoint = 800 bytes of memory for the UTF8 buffer. * = 2400 bytes of memory or @b 12x memory amplification! */ -SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_bool_t can_be_unicode, sz_memory_allocator_t *alloc) { +SZ_INTERNAL sz_status_t _sz_levenshtein_distance_wagner_fisher_serial( // + sz_cptr_t longer, sz_size_t longer_length, // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_size_t bound, sz_bool_t can_be_unicode, sz_memory_allocator_t *alloc, sz_size_t *result_ptr) { // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. sz_memory_allocator_t global_alloc; @@ -329,7 +424,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // // If the allocation fails, return the maximum distance. sz_ptr_t const buffer = (sz_ptr_t)alloc->allocate(buffer_length, alloc->handle); - if (!buffer) return SZ_SIZE_MAX; + if (!buffer) return sz_bad_alloc_k; // Let's export the UTF8 sequence into the newly allocated buffer at the end. if (can_be_unicode == sz_true_k) { @@ -378,7 +473,8 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // /* Cache scalar before `free` call. 
*/ \ sz_size_t result = previous_distances[shorter_length]; \ alloc->free(buffer, buffer_length, alloc->handle); \ - return result; + *result_ptr = result; \ + return sz_success_k; // Let's define a separate variant for bounded distance computation. // Practically the same as unbounded, but also collecting the running minimum within each row for early exit. @@ -408,7 +504,8 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // /* If the minimum distance in this row exceeded the bound, return early */ \ if (min_distance >= bound) { \ alloc->free(buffer, buffer_length, alloc->handle); \ - return longer_length + 1; \ + *result_ptr = bound; \ + return sz_success_k; \ } \ _distance_t *temporary = previous_distances; \ previous_distances = current_distances; \ @@ -416,7 +513,8 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // } \ sz_size_t result = previous_distances[shorter_length]; \ alloc->free(buffer, buffer_length, alloc->handle); \ - return result; + *result_ptr = result; \ + return sz_success_k; // Dispatch the actual computation. if (!bound) { @@ -429,10 +527,10 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( // } } -SZ_PUBLIC sz_size_t sz_edit_distance_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { +SZ_PUBLIC sz_status_t sz_levenshtein_distance_serial( // + sz_cptr_t longer, sz_size_t longer_length, // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_size_t bound, sz_memory_allocator_t *alloc, sz_size_t *result_ptr) { // Let's make sure that we use the amount proportional to the // number of elements in the shorter string, not the larger. @@ -452,28 +550,48 @@ SZ_PUBLIC sz_size_t sz_edit_distance_serial( // int const is_bounded = bound < longer_length; if (is_bounded) { // If one of the strings is empty - the edit distance is equal to the length of the other one. - if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); + if (longer_length == 0) { + *result_ptr = sz_min_of_two(shorter_length, bound); + return sz_success_k; + } + if (shorter_length == 0) { + *result_ptr = sz_min_of_two(longer_length, bound); + return sz_success_k; + } // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; + if (longer_length - shorter_length > bound) { + *result_ptr = bound; + return sz_success_k; + } } - if (shorter_length == 0) return longer_length; // If no mismatches were found - the distance is zero. + // If no mismatches were found - the distance is zero. 
+ if (shorter_length == 0) { + *result_ptr = longer_length; + return sz_success_k; + } if (shorter_length == longer_length && !is_bounded) - return _sz_edit_distance_skewed_diagonals_serial(longer, longer_length, shorter, shorter_length, bound, alloc); - return _sz_edit_distance_wagner_fisher_serial( // - longer, longer_length, shorter, shorter_length, bound, sz_false_k, alloc); + return _sz_levenshtein_distance_skewed_diagonals_serial(longer, longer_length, shorter, shorter_length, bound, + alloc, result_ptr); + return _sz_levenshtein_distance_wagner_fisher_serial( // + longer, longer_length, shorter, shorter_length, bound, sz_false_k, alloc, result_ptr); } -SZ_PUBLIC sz_ssize_t sz_alignment_score_serial( // - sz_cptr_t longer, sz_size_t longer_length, // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, // - sz_memory_allocator_t *alloc) { +SZ_PUBLIC sz_status_t sz_needleman_wunsch_score_serial( // + sz_cptr_t longer, sz_size_t longer_length, // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_error_cost_t const *subs, sz_error_cost_t gap, // + sz_memory_allocator_t *alloc, sz_ssize_t *result_ptr) { // If one of the strings is empty - the edit distance is equal to the length of the other one - if (longer_length == 0) return (sz_ssize_t)shorter_length * gap; - if (shorter_length == 0) return (sz_ssize_t)longer_length * gap; + if (longer_length == 0) { + *result_ptr = (sz_ssize_t)shorter_length * gap; + return sz_success_k; + } + if (shorter_length == 0) { + *result_ptr = (sz_ssize_t)longer_length * gap; + return sz_success_k; + } // Let's make sure that we use the amount proportional to the // number of elements in the shorter string, not the larger. @@ -519,13 +637,14 @@ SZ_PUBLIC sz_ssize_t sz_alignment_score_serial( // // Cache scalar before `free` call. 
sz_ssize_t result = previous_distances[shorter_length]; alloc->free(distances, buffer_length, alloc->handle); - return result; + *result_ptr = result; + return sz_success_k; } -SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { +SZ_PUBLIC sz_status_t sz_hamming_distance_serial( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound, sz_size_t *result_ptr) { sz_size_t const min_length = sz_min_of_two(a_length, b_length); sz_size_t const max_length = sz_max_of_two(a_length, b_length); @@ -547,13 +666,14 @@ SZ_PUBLIC sz_size_t sz_hamming_distance_serial( // #endif for (; a != a_end && distance < bound; ++a, ++b) { distance += (*a != *b); } - return sz_min_of_two(distance, bound); + *result_ptr = sz_min_of_two(distance, bound); + return sz_success_k; } -SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { +SZ_PUBLIC sz_status_t sz_hamming_distance_utf8_serial( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound, sz_size_t *result_ptr) { sz_cptr_t const a_end = a + a_length; sz_cptr_t const b_end = b + b_length; @@ -587,7 +707,8 @@ SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( // for (; a < a_end; a += a_rune_length, ++distance) _sz_extract_utf8_rune(a, &a_rune, &a_rune_length); for (; b < b_end; b += b_rune_length, ++distance) _sz_extract_utf8_rune(b, &b_rune, &b_rune_length); } - return distance; + *result_ptr = distance; + return sz_success_k; } #pragma endregion // Serial Implementation @@ -646,9 +767,9 @@ SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( // *? Bounds check, for inputs ranging from 33 to 64 bytes doesn't affect the performance at all. *? It's also worth exploring `_mm512_alignr_epi8` and `_mm512_maskz_compress_epi8` for the shift. */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_ice( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // +SZ_INTERNAL sz_size_t _sz_levenshtein_distance_skewed_diagonals_upto63_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // sz_size_t bound) { sz_size_t const max_length = 63u; @@ -815,9 +936,9 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto63_ice( // * - source code analysis, assuming most lines are either under 80 or under 120 characters long. * - DNA sequence alignment, as most short reads are 50-300 characters long. */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto127_ice( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // +SZ_INTERNAL sz_size_t _sz_levenshtein_distance_skewed_diagonals_upto127_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // sz_size_t bound) { sz_unused(shorter && shorter_length && longer && longer_length && bound); return 0; @@ -835,9 +956,9 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto127_ice( // * This is the largest space-efficient variant, as strings beyond 255 characters may require * 16-bit accumulators, which would be a significant bottleneck. 
*/ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto_ice( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // +SZ_INTERNAL sz_size_t _sz_levenshtein_distance_skewed_diagonals_upto_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // sz_size_t bound) { sz_unused(shorter && shorter_length && longer && longer_length && bound); return 0; @@ -856,9 +977,9 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto_ice( // * This is the largest space-efficient variant, as strings beyond 255 characters may require * 16-bit accumulators, which would be a significant bottleneck. */ -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto255bound_ice( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // +SZ_INTERNAL sz_size_t _sz_levenshtein_distance_skewed_diagonals_upto255bound_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // sz_size_t bound) { sz_unused(shorter && shorter_length && longer && longer_length && bound); return 0; @@ -873,20 +994,20 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto255bound_ice( // * * Each string is unpacked into 128 characters * 4 bytes per character / 64 bytes per register = 8 registers. */ -SZ_INTERNAL sz_size_t _sz_edit_distance_utf8_skewed_diagonals_upto127_ice( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // +SZ_INTERNAL sz_size_t _sz_levenshtein_distance_utf8_skewed_diagonals_upto127_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // sz_size_t bound) { sz_unused(shorter && shorter_length && longer && longer_length && bound); return 0; } -SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_ice( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { +SZ_INTERNAL sz_status_t _sz_levenshtein_distance_skewed_diagonals_upto65k_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // + sz_size_t bound, sz_memory_allocator_t *alloc, sz_size_t *result_ptr) { - sz_unused(shorter && longer && bound && alloc); + sz_unused(shorter && longer && bound && alloc && result_ptr); // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. sz_memory_allocator_t global_alloc; @@ -1037,22 +1158,31 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_ice( // alloc->free(distances, buffer_length, alloc->handle); return result; #endif - return 0; + return sz_success_k; } -SZ_INTERNAL sz_size_t sz_edit_distance_ice( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { +SZ_PUBLIC sz_status_t sz_levenshtein_distance_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // + sz_size_t bound, sz_memory_allocator_t *alloc, sz_size_t *result_ptr) { // Bounded computations may exit early. int const is_bounded = bound < longer_length; if (is_bounded) { // If one of the strings is empty - the edit distance is equal to the length of the other one. 
- if (longer_length == 0) return sz_min_of_two(shorter_length, bound); - if (shorter_length == 0) return sz_min_of_two(longer_length, bound); + if (longer_length == 0) { + *result_ptr = sz_min_of_two(shorter_length, bound); + return sz_success_k; + } + if (shorter_length == 0) { + *result_ptr = sz_min_of_two(longer_length, bound); + return sz_success_k; + } // If the difference in length is beyond the `bound`, there is no need to check at all. - if (longer_length - shorter_length > bound) return bound; + if (longer_length - shorter_length > bound) { + *result_ptr = bound; + return sz_success_k; + } } // Make sure the shorter string is actually shorter. @@ -1066,14 +1196,17 @@ SZ_INTERNAL sz_size_t sz_edit_distance_ice( // } // Dispatch the right implementation based on the length of the strings. - if (longer_length < 64u) - return _sz_edit_distance_skewed_diagonals_upto63_ice( // + if (longer_length < 64u) { + *result_ptr = _sz_levenshtein_distance_skewed_diagonals_upto63_ice( // shorter, shorter_length, longer, longer_length, bound); + return sz_success_k; + } + // else if (longer_length < 256u * 256u) - // return _sz_edit_distance_skewed_diagonals_upto65k_ice( // + // return _sz_levenshtein_distance_skewed_diagonals_upto65k_ice( // // shorter, shorter_length, longer, longer_length, bound, alloc); else - return sz_edit_distance_serial(shorter, shorter_length, longer, longer_length, bound, alloc); + return sz_levenshtein_distance_serial(shorter, shorter_length, longer, longer_length, bound, alloc, result_ptr); } /** @@ -1082,21 +1215,27 @@ SZ_INTERNAL sz_size_t sz_edit_distance_ice( // * Assuming the costs of substitutions can be arbitrary signed 8-bit integers, the method is expected to be used * on strings not exceeding 2^24 length or 16.7 million characters. * - * Unlike the `_sz_edit_distance_skewed_diagonals_upto65k_avx512` method, this one uses signed integers to store + * Unlike the `_sz_levenshtein_distance_skewed_diagonals_upto65k_avx512` method, this one uses signed integers to store * the accumulated score. Moreover, it's primary bottleneck is the latency of gathering the substitution costs * from the substitution matrix. If we use the diagonal order, we will be comparing a slice of the first string * with a slice of the second. If we stick to the conventional horizontal order, we will be comparing one character * against a slice, which is much easier to optimize. In that case we are sampling costs not from arbitrary parts of * a 256 x 256 matrix, but from a single row! 
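 *
 * An illustrative sketch of that inner loop (not the exact kernel below, variable names are hypothetical):
 *
 * @code{.c}
 *     sz_error_cost_t const *row = subs + 256 * (sz_u8_t)longer[i]; //? one row of the 256 x 256 matrix
 *     for (sz_size_t j = 0; j != shorter_length; ++j)
 *         cost = row[(sz_u8_t)shorter[j]];                          //? sequential row lookups, no gathers
 * @endcode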
*/ -SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_ice( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { +SZ_INTERNAL sz_status_t _sz_needleman_wunsch_score_wagner_fisher_upto17m_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // + sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc, sz_ssize_t *result_ptr) { // If one of the strings is empty - the edit distance is equal to the length of the other one - if (longer_length == 0) return (sz_ssize_t)shorter_length * gap; - if (shorter_length == 0) return (sz_ssize_t)longer_length * gap; + if (longer_length == 0) { + *result_ptr = (sz_ssize_t)shorter_length * gap; + return sz_success_k; + } + if (shorter_length == 0) { + *result_ptr = (sz_ssize_t)longer_length * gap; + return sz_success_k; + } // Let's make sure that we use the amount proportional to the // number of elements in the shorter string, not the larger. @@ -1119,6 +1258,7 @@ SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_ice( // sz_size_t buffer_length = sizeof(sz_i32_t) * n * 2; sz_i32_t *distances = (sz_i32_t *)alloc->allocate(buffer_length, alloc->handle); + if (!distances) return sz_bad_alloc_k; sz_i32_t *previous_distances = distances; sz_i32_t *current_distances = previous_distances + n; @@ -1297,19 +1437,21 @@ SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_ice( // // Cache scalar before `free` call. sz_ssize_t result = previous_distances[longer_length]; alloc->free(distances, buffer_length, alloc->handle); - return result; + *result_ptr = result; + return sz_success_k; } -SZ_INTERNAL sz_ssize_t sz_alignment_score_ice( // - sz_cptr_t shorter, sz_size_t shorter_length, // - sz_cptr_t longer, sz_size_t longer_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { +SZ_PUBLIC sz_status_t sz_needleman_wunsch_score_ice( // + sz_cptr_t shorter, sz_size_t shorter_length, // + sz_cptr_t longer, sz_size_t longer_length, // + sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc, sz_ssize_t *result_ptr) { if (sz_max_of_two(shorter_length, longer_length) < (256ull * 256ull * 256ull)) - return _sz_alignment_score_wagner_fisher_upto17m_ice(shorter, shorter_length, longer, longer_length, subs, gap, - alloc); + return _sz_needleman_wunsch_score_wagner_fisher_upto17m_ice(shorter, shorter_length, longer, longer_length, + subs, gap, alloc, result_ptr); else - return sz_alignment_score_serial(shorter, shorter_length, longer, longer_length, subs, gap, alloc); + return sz_needleman_wunsch_score_serial(shorter, shorter_length, longer, longer_length, subs, gap, alloc, + result_ptr); } #pragma clang attribute pop @@ -1351,46 +1493,46 @@ SZ_INTERNAL sz_ssize_t sz_alignment_score_ice( // #pragma region Compile Time Dispatching #if !SZ_DYNAMIC_DISPATCH -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_serial(a, a_length, b, b_length, bound); +SZ_DYNAMIC sz_status_t sz_hamming_distance( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound, sz_size_t *result_ptr) { + return sz_hamming_distance_serial(a, a_length, b, b_length, bound, result_ptr); } -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8( // - sz_cptr_t a, 
sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_utf8_serial(a, a_length, b, b_length, bound); +SZ_DYNAMIC sz_status_t sz_hamming_distance_utf8( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound, sz_size_t *result_ptr) { + return sz_hamming_distance_utf8_serial(a, a_length, b, b_length, bound, result_ptr); } -SZ_DYNAMIC sz_size_t sz_edit_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { +SZ_DYNAMIC sz_status_t sz_levenshtein_distance( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound, sz_memory_allocator_t *alloc, sz_size_t *result_ptr) { #if SZ_USE_ICE - return sz_edit_distance_ice(a, a_length, b, b_length, bound, alloc); + return sz_levenshtein_distance_ice(a, a_length, b, b_length, bound, alloc, result_ptr); #else - return sz_edit_distance_serial(a, a_length, b, b_length, bound, alloc); + return sz_levenshtein_distance_serial(a, a_length, b, b_length, bound, alloc, result_ptr); #endif } -SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - return _sz_edit_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc); +SZ_DYNAMIC sz_status_t sz_levenshtein_distance_utf8( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound, sz_memory_allocator_t *alloc, sz_size_t *result_ptr) { + return _sz_levenshtein_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc, result_ptr); } -SZ_DYNAMIC sz_ssize_t sz_alignment_score( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { +SZ_DYNAMIC sz_status_t sz_needleman_wunsch_score( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // + sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc, sz_ssize_t *result_ptr) { #if SZ_USE_ICE - return sz_alignment_score_ice(a, a_length, b, b_length, subs, gap, alloc); + return sz_needleman_wunsch_score_ice(a, a_length, b, b_length, subs, gap, alloc, result_ptr); #else - return sz_alignment_score_serial(a, a_length, b, b_length, subs, gap, alloc); + return sz_needleman_wunsch_score_serial(a, a_length, b, b_length, subs, gap, alloc, result_ptr); #endif } diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index 24f8fc94..143f252e 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -3316,12 +3316,12 @@ class basic_string { concatenation operator|(string_view other) const noexcept { return {view(), other}; } size_type edit_distance(string_view other, size_type bound = 0) const noexcept { - size_type distance; + size_type result; _with_alloc([&](sz_alloc_type &alloc) { - distance = sz_edit_distance(data(), size(), other.data(), other.size(), bound, &alloc); - return true; + return sz_levenshtein_distance(data(), size(), other.data(), other.size(), bound, &alloc, &result) != + sz_bad_alloc_k; }); - return distance; + return result; } /** @brief Hashes the string, equivalent to `std::hash{}(str)`. */ @@ -3783,18 +3783,20 @@ typename concatenation_result::type /** * @brief Calculates the Hamming edit distance in @b bytes between two strings. 
- * @see sz_edit_distance + * @see sz_levenshtein_distance */ template std::size_t hamming_distance( // basic_string_slice const &a, basic_string_slice const &b, // std::size_t bound = 0) noexcept { - return sz_hamming_distance(a.data(), a.size(), b.data(), b.size(), bound); + std::size_t result; + sz_hamming_distance(a.data(), a.size(), b.data(), b.size(), bound, &result); + return result; } /** * @brief Calculates the Hamming edit distance in @b bytes between two strings. - * @see sz_edit_distance + * @see sz_levenshtein_distance */ template ::type>> std::size_t hamming_distance( // @@ -3810,12 +3812,14 @@ std::size_t hamming_distance( template std::size_t hamming_distance_utf8( // basic_string_slice const &a, basic_string_slice const &b, std::size_t bound = 0) noexcept { - return sz_hamming_distance_utf8(a.data(), a.size(), b.data(), b.size(), bound); + std::size_t result; + sz_hamming_distance_utf8(a.data(), a.size(), b.data(), b.size(), bound, &result); + return result; } /** * @brief Calculates the Hamming edit distance in @b unicode codepoints between two strings. - * @see sz_edit_distance + * @see sz_levenshtein_distance */ template ::type>> std::size_t hamming_distance_utf8( // @@ -3826,7 +3830,7 @@ std::size_t hamming_distance_utf8( // /** * @brief Calculates the Levenshtein edit distance in @b bytes between two strings. - * @see sz_edit_distance + * @see sz_levenshtein_distance */ template ::type>> std::size_t edit_distance( // @@ -3834,8 +3838,8 @@ std::size_t edit_distance( // allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) { std::size_t result; if (!_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) { - result = sz_edit_distance(a.data(), a.size(), b.data(), b.size(), bound, &alloc); - return result != SZ_SIZE_MAX; + return sz_levenshtein_distance(a.data(), a.size(), b.data(), b.size(), bound, &alloc, &result) != + sz_bad_alloc_k; })) throw std::bad_alloc(); return result; @@ -3843,7 +3847,7 @@ std::size_t edit_distance( // /** * @brief Calculates the Levenshtein edit distance in @b bytes between two strings. - * @see sz_edit_distance + * @see sz_levenshtein_distance */ template > std::size_t edit_distance( // @@ -3854,7 +3858,7 @@ std::size_t edit_distance( /** * @brief Calculates the Levenshtein edit distance in @b unicode codepoints between two strings. - * @see sz_edit_distance_utf8 + * @see sz_levenshtein_distance_utf8 */ template ::type>> std::size_t edit_distance_utf8( // @@ -3862,8 +3866,8 @@ std::size_t edit_distance_utf8( std::size_t bound = SZ_SIZE_MAX, allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) { std::size_t result; if (!_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) { - result = sz_edit_distance_utf8(a.data(), a.size(), b.data(), b.size(), bound, &alloc); - return result != SZ_SIZE_MAX; + return sz_levenshtein_distance_utf8(a.data(), a.size(), b.data(), b.size(), bound, &alloc, &result) != + sz_bad_alloc_k; })) throw std::bad_alloc(); return result; @@ -3871,7 +3875,7 @@ std::size_t edit_distance_utf8( /** * @brief Calculates the Levenshtein edit distance in @b unicode codepoints between two strings. - * @see sz_edit_distance_utf8 + * @see sz_levenshtein_distance_utf8 */ template > std::size_t edit_distance_utf8( // @@ -3882,7 +3886,7 @@ std::size_t edit_distance_utf8( /** * @brief Calculates the Needleman-Wunsch alignment score between two strings. 
- * @see sz_alignment_score + * @see sz_needleman_wunsch_score */ template ::type>> std::ptrdiff_t alignment_score( // @@ -3896,8 +3900,8 @@ std::ptrdiff_t alignment_score( std::ptrdiff_t result; if (!_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) { - result = sz_alignment_score(a.data(), a.size(), b.data(), b.size(), &subs[0][0], gap, &alloc); - return result != SZ_SSIZE_MAX; + return sz_needleman_wunsch_score(a.data(), a.size(), b.data(), b.size(), &subs[0][0], gap, &alloc, + &result) != sz_bad_alloc_k; })) throw std::bad_alloc(); return result; @@ -3905,7 +3909,7 @@ std::ptrdiff_t alignment_score( /** * @brief Calculates the Needleman-Wunsch alignment score between two strings. - * @see sz_alignment_score + * @see sz_needleman_wunsch_score */ template > std::ptrdiff_t alignment_score( // @@ -3973,17 +3977,17 @@ struct _sequence_args { }; template -sz_cptr_t _call_sequence_member_start(struct sz_sequence_t const *sequence, sz_size_t i) { +sz_cptr_t _call_sequence_member_start(void const *sequence, sz_size_t i) { using handle_type = _sequence_args; - handle_type const *args = reinterpret_cast(sequence->handle); + handle_type const *args = reinterpret_cast(sequence); string_view member = args->extractor(args->begin[i]); return member.data(); } template -sz_size_t _call_sequence_member_length(struct sz_sequence_t const *sequence, sz_size_t i) { +sz_size_t _call_sequence_member_length(void const *sequence, sz_size_t i) { using handle_type = _sequence_args; - handle_type const *args = reinterpret_cast(sequence->handle); + handle_type const *args = reinterpret_cast(sequence); string_view member = args->extractor(args->begin[i]); return static_cast(member.size()); } From d7bab8d58fc7d9e77e6553f7e39aa9cd34a29f93 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 22 Feb 2025 12:36:44 +0000 Subject: [PATCH 121/751] Docs: Explaining `compare.h` operations --- include/stringzilla/compare.h | 73 ++++++++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 18 deletions(-) diff --git a/include/stringzilla/compare.h b/include/stringzilla/compare.h index 9f2e276d..4d0a7cb5 100644 --- a/include/stringzilla/compare.h +++ b/include/stringzilla/compare.h @@ -7,8 +7,13 @@ * * - `sz_equal` - for equality comparison of two strings. * - `sz_order` - for the relative order of two strings, similar to `memcmp`. - * - TODO: `sz_mismatch`, `sz_rmismatch` - to supersede `sz_equal`. - * - TODO: `sz_order_utf8` - for the relative order of two UTF-8 strings. + * + * A valid suggestion may be to add an `sz_mismatch`, as the shared part of the `sz_order` and `sz_equal`. + * That would be great for a general-purpose library, but has little practical use for string processing. + * + * The functions in this file can be used for both UTF-8 and other inputs. + * On platforms without masked loads they use interleaved prefix and suffix vector-loads + * to avoid scalar code, similar to the kernels in `memory.h`. */ #ifndef STRINGZILLA_COMPARE_H_ #define STRINGZILLA_COMPARE_H_ @@ -22,29 +27,61 @@ extern "C" { #pragma region Core API /** - * @brief Checks if two string are equal. - * Similar to `memcmp(a, b, length) == 0` in LibC and `a == b` in STL. + * @brief Checks if two strings are equal. Equivalent to `memcmp(a, b, length) == 0` in LibC and `a == b` in STL. + * @see https://en.cppreference.com/w/c/string/byte/memcmp + * + * @param[in] a First string to compare. + * @param[in] b Second string to compare. 
+ * @param[in] length Number of bytes to compare in both strings. + * + * @retval `sz_true_k` if strings are equal. + * @retval `sz_false_k` if strings are different. + * + * Example usage: * - * The implementation of this function is very similar to `sz_order`, but the usage patterns are different. - * This function is more often used in parsing, while `sz_order` is often used in sorting. - * It works best on platforms with cheap + * @code{.c} + * #include + * int main() { + * return sz_equal("hello", "hello", 5) && !sz_equal("hello", "world", 5); + * } + * @endcode * - * @param a First string to compare. - * @param b Second string to compare. - * @param length Number of bytes in both strings. - * @return 1 if strings match, 0 otherwise. + * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. + * @sa sz_equal_serial, sz_equal_haswell, sz_equal_skylake, sz_equal_neon, sz_equal_sve */ SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length); /** - * @brief Estimates the relative order of two strings. Equivalent to `memcmp(a, b, length)` in LibC. - * Can be used on different length strings. + * @brief Compares two strings lexicographically. Equivalent to `memcmp(a, b, length)` in LibC. + * Mostly used in sorting and associative containers. Can be used for @b UTF-8 inputs. + * @see https://en.cppreference.com/w/c/string/byte/memcmp + * + * This function uses scalar code on most platforms, as in the majority of cases the strings that + * differ - will have differences among the very first characters and fetching more than one cache + * line may not be justified. + * + * @param[in] a First string to compare. + * @param[in] a_length Number of bytes in the first string. + * @param[in] b Second string to compare. + * @param[in] b_length Number of bytes in the second string. + * + * @retval `sz_less_k` if @p a is lexicographically smaller than @p b. + * @retval `sz_greater_k` if @p a is lexicographically greater than @p b. + * @retval `sz_equal_k` if strings @p a and @p b are identical. + * + * Example usage: + * + * @code{.c} + * #include + * int main() { + * return sz_order("apple", 5, "banana", 6) < 0 && + * sz_order("grape", 5, "grape", 5) == 0 && + * sz_order("zebra", 5, "apple", 5) > 0; + * } + * @endcode * - * @param a First string to compare. - * @param a_length Number of bytes in the first string. - * @param b Second string to compare. - * @param b_length Number of bytes in the second string. - * @return Negative if (a < b), positive if (a > b), zero if they are equal. + * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. + * @sa sz_order_serial, sz_order_haswell, sz_order_skylake, sz_order_neon, sz_order_sve */ SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length); From 7aad4bb681d1f1d7aae031dd657ed53d82291b17 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 23 Feb 2025 13:07:44 +0000 Subject: [PATCH 122/751] Improve: Vectorize `sz_equal_haswell` --- include/stringzilla/compare.h | 69 ++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 14 deletions(-) diff --git a/include/stringzilla/compare.h b/include/stringzilla/compare.h index 4d0a7cb5..494d1442 100644 --- a/include/stringzilla/compare.h +++ b/include/stringzilla/compare.h @@ -10,7 +10,7 @@ * * A valid suggestion may be to add an `sz_mismatch`, as the shared part of the `sz_order` and `sz_equal`. 
* That would be great for a general-purpose library, but has little practical use for string processing. - * + * * The functions in this file can be used for both UTF-8 and other inputs. * On platforms without masked loads they use interleaved prefix and suffix vector-loads * to avoid scalar code, similar to the kernels in `memory.h`. @@ -59,7 +59,7 @@ SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length); * This function uses scalar code on most platforms, as in the majority of cases the strings that * differ - will have differences among the very first characters and fetching more than one cache * line may not be justified. - * + * * @param[in] a First string to compare. * @param[in] a_length Number of bytes in the first string. * @param[in] b Second string to compare. @@ -172,19 +172,60 @@ SZ_PUBLIC sz_ordering_t sz_order_haswell(sz_cptr_t a, sz_size_t a_length, sz_cpt } SZ_PUBLIC sz_bool_t sz_equal_haswell(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { - sz_u256_vec_t a_vec, b_vec; - - while (length >= 32) { - a_vec.ymm = _mm256_lddqu_si256((__m256i const *)a); - b_vec.ymm = _mm256_lddqu_si256((__m256i const *)b); - // One approach can be to use "movemasks", but we could also use a bitwise matching like `_mm256_testnzc_si256`. - int difference_mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi8(a_vec.ymm, b_vec.ymm)); - if (difference_mask == 0) { a += 32, b += 32, length -= 32; } - else { return sz_false_k; } - } - if (length) return sz_equal_serial(a, b, length); - return sz_true_k; + if (length < 8) { + sz_cptr_t const a_end = a + length; + while (a != a_end && *a == *b) a++, b++; + return (sz_bool_t)(a_end == a); + } + // We can use 2x 64-bit interleaving loads for each string, and then compare them for equality. + // The same approach is used in GLibC and was suggest by Denis Yaroshevskiy. + // https://codebrowser.dev/glibc/glibc/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S.html#518 + // It shouldn't improve performance on microbenchmarks, but should be better in practice. + else if (length <= 16) { + sz_u64_t a_first_word = sz_u64_load(a).u64; + sz_u64_t b_first_word = sz_u64_load(b).u64; + sz_u64_t a_second_word = sz_u64_load(a + length - 8).u64; + sz_u64_t b_second_word = sz_u64_load(b + length - 8).u64; + return (sz_bool_t)((a_first_word == b_first_word) & (a_second_word == b_second_word)); + } + // We can use 2x 128-bit interleaving loads for each string, and then compare them for equality. + else if (length <= 32) { + sz_u128_vec_t a_first_vec, b_first_vec, a_second_vec, b_second_vec; + a_first_vec.xmm = _mm_lddqu_si128((__m128i const *)(a)); + b_first_vec.xmm = _mm_lddqu_si128((__m128i const *)(b)); + a_second_vec.xmm = _mm_lddqu_si128((__m128i const *)(a + length - 16)); + b_second_vec.xmm = _mm_lddqu_si128((__m128i const *)(b + length - 16)); + return (sz_bool_t)(_mm_movemask_epi8(_mm_and_si128( // + _mm_cmpeq_epi8(a_first_vec.xmm, b_first_vec.xmm), + _mm_cmpeq_epi8(a_second_vec.xmm, b_second_vec.xmm))) == 0xFFFF); + } + // We can use 2x 256-bit interleaving loads for each string, and then compare them for equality. 
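// For instance, with a 40-byte input the first 32-byte load covers bytes [0, 32) and the second
// covers [8, 40): every byte participates in at least one comparison, the 24-byte overlap is simply
// compared twice, and no scalar tail loop is needed.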
+ else if (length <= 64) { + sz_u256_vec_t a_first_vec, b_first_vec, a_second_vec, b_second_vec; + a_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(a)); + b_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(b)); + a_second_vec.ymm = _mm256_lddqu_si256((__m256i const *)(a + length - 32)); + b_second_vec.ymm = _mm256_lddqu_si256((__m256i const *)(b + length - 32)); + return (sz_bool_t)(_mm256_movemask_epi8(_mm256_and_si256( // + _mm256_cmpeq_epi8(a_first_vec.ymm, b_first_vec.ymm), + _mm256_cmpeq_epi8(a_second_vec.ymm, b_second_vec.ymm))) == (int)0xFFFFFFFF); + } + else { + sz_size_t i = 0; + sz_u256_vec_t a_vec, b_vec; + do { + a_vec.ymm = _mm256_lddqu_si256((__m256i const *)(a + i)); + b_vec.ymm = _mm256_lddqu_si256((__m256i const *)(b + i)); + // One approach can be to use "movemasks", but we could also use a bitwise + // matching like `_mm256_testnzc_si256`. + if (_mm256_movemask_epi8(_mm256_cmpeq_epi8(a_vec.ymm, b_vec.ymm)) != (int)0xFFFFFFFF) return sz_false_k; + i += 32; + } while (i + 32 <= length); + a_vec.ymm = _mm256_lddqu_si256((__m256i const *)(a + length - 32)); + b_vec.ymm = _mm256_lddqu_si256((__m256i const *)(b + length - 32)); + return (sz_bool_t)(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a_vec.ymm, b_vec.ymm)) == (int)0xFFFFFFFF); + } } #pragma clang attribute pop From 6e715362a7f47bfa35ae01690a314e0bf4baefc8 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 23 Feb 2025 13:09:44 +0000 Subject: [PATCH 123/751] Improve: Ordering includes --- include/stringzilla/stringzilla.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index 349aba79..660ffa6c 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -30,7 +30,7 @@ * * - `SZ_USE_HASWELL=?` - whether to use AVX2 instructions on x86_64. * - `SZ_USE_SKYLAKE=?` - whether to use AVX-512 instructions on x86_64. - * - `SZ_USE_ICE=?` - whether to use AVX-512 VBMI instructions on x86_64. + * - `SZ_USE_ICE=?` - whether to use AVX-512 VBMI & wider AES instructions on x86_64. * - `SZ_USE_NEON=?` - whether to use NEON instructions on ARM. * - `SZ_USE_SVE=?` - whether to use SVE and SVE2 instructions on ARM. 
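 *
 * For example, a translation unit that wants to opt out of the AVX-512 code paths could, in principle,
 * define the corresponding macros before including the header (an illustrative sketch, not a tested setup):
 *
 * @code{.c}
 *     #define SZ_USE_SKYLAKE 0
 *     #define SZ_USE_ICE 0
 *     #include <stringzilla/stringzilla.h>
 * @endcode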
*/ @@ -41,14 +41,14 @@ #define STRINGZILLA_VERSION_MINOR 11 #define STRINGZILLA_VERSION_PATCH 3 +#include "types.h" // `sz_size_t`, `sz_bool_t`, `sz_ordering_t` #include "compare.h" // `sz_equal`, `sz_order` -#include "find.h" // `sz_find`, `sz_find_charset`, `sz_rfind` -#include "hash.h" // `sz_bytesum`, `sz_hash`, `sz_state_init`, `sz_state_stream`, `sz_state_fold` #include "memory.h" // `sz_copy`, `sz_move`, `sz_fill` -#include "similarity.h" // `sz_edit_distance`, `sz_alignment_score` +#include "hash.h" // `sz_bytesum`, `sz_hash`, `sz_state_init`, `sz_state_stream`, `sz_state_fold` +#include "find.h" // `sz_find`, `sz_find_charset`, `sz_rfind` #include "small_string.h" // `sz_string_t`, `sz_string_init`, `sz_string_free` +#include "similarity.h" // `sz_levenshtein_distance`, `sz_needleman_wunsch_score` #include "sort.h" // `sz_sequence_argsort`, `sz_pgrams_sort`, `sz_pgrams_sort_stable` -#include "types.h" // `sz_size_t`, `sz_bool_t`, `sz_ordering_t` #ifdef __cplusplus extern "C" { From 2225488a0e035a01e50cc78e8c2aa4d3ff8c33ef Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 23 Feb 2025 20:19:15 +0000 Subject: [PATCH 124/751] Docs: Announce JOINs --- include/stringzilla/sort.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index c422e3c3..8e387b70 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -1,28 +1,29 @@ /** - * @brief Hardware-accelerated string collection sorting. + * @brief Hardware-accelerated string collection sorting & joins. * @file sort.h * @author Ash Vardanian * * Includes core APIs for `sz_sequence_t` string collections: * - * - `sz_sequence_argsort` - to get the sorting permutation of a string collection with QuickSort. - * - `sz_sequence_argsort_stable` - to get the stable-sorting permutation of a string collection with a MergeSort. + * - `sz_sequence_argsort` - to get the sorting permutation of a string collection. + * - `sz_sequence_join` - to compute the intersection of two arbitrary string collections. * - * The core idea of all following string algorithms is to sort strings not based on 1 character at a time, + * The core idea of all following string algorithms is to process strings not based on 1 character at a time, * but on a larger "Pointer-sized N-grams" fitting in 4 or 8 bytes at once, on 32-bit or 64-bit architectures, - * respectively. In reality we may not use the full pointer size, but only a few bytes from it, and keep the rest - * for some metadata. + * respectively. In reality we may not use the full pointer size, but only a few bytes from it, and keep the + * rest for some metadata. * - * That, however, means, that unsigned integer sorting is a constituent part of our string sorting and we can - * expose it as an additional set of APIs for the users: + * That, however, means, that unsigned integer sorting & matching is a constituent part of our sequence + * algorithms and we can expose them as an additional set of APIs for the users: * - * - `sz_pgrams_sort` - to inplace sort continuous pointer-sized integers with QuickSort. - * - `sz_pgrams_sort_stable` - to inplace stable-sort continuous pointer-sized integers with a MergeSort. + * - `sz_pgrams_sort` - to inplace sort continuous pointer-sized integers. + * - `sz_pgrams_join` - to compute the intersection of two arbitrary integer collections. 
* - * For cases, when the input is known to be tiny, we provide quadratic-complexity insertion sort adaptations: + * Other helpers include: * - * - `sz_sequence_argsort_with_insertion` - for string collections. - * - `sz_pgrams_sort_stable_with_insertion` - for continuous unsigned integers. + * - `sz_pgrams_sort_stable_with_insertion` - for quadratic-complexity sorting of small continuous integer arrays. + * - `sz_sequence_argsort_with_insertion` - for quadratic-complexity sorting of small string collections. + * - `sz_sequence_argsort_stabilize` - updates the sorting permutation to be stable. */ #ifndef STRINGZILLA_SORT_H_ #define STRINGZILLA_SORT_H_ @@ -845,7 +846,7 @@ SZ_PUBLIC sz_status_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t SZ_PUBLIC sz_status_t sz_sequence_argsort_stable_serial(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order) { - + sz_unused(sequence && alloc && order); return sz_success_k; } From 3c345bcbeefa4bc5dcf3745dc2663d0b35168327 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 23 Feb 2025 20:19:53 +0000 Subject: [PATCH 125/751] Add: Hashing on Haswell & Skylake-X --- include/stringzilla/hash.h | 444 ++++++++++++++++++++++++++++--------- 1 file changed, 341 insertions(+), 103 deletions(-) diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index 2094a0d3..539f016a 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -66,9 +66,21 @@ extern "C" { * @brief Computes the 64-bit check-sum of bytes in a string. * Similar to `std::ranges::accumulate`. * - * @param text String to aggregate. - * @param length Number of bytes in the text. - * @return 64-bit unsigned value. + * @param[in] text String to aggregate. + * @param[in] length Number of bytes in the text. + * @return 64-bit unsigned value. + * + * Example usage: + * + * @code{.c} + * #include + * int main() { + * return sz_bytesum("hi", 2) == 209 ? 0 : 1; + * } + * @endcode + * + * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. + * @sa sz_bytesum_serial, sz_bytesum_haswell, sz_bytesum_skylake, sz_bytesum_ice, sz_bytesum_neon */ SZ_DYNAMIC sz_u64_t sz_bytesum(sz_cptr_t text, sz_size_t length); @@ -78,10 +90,25 @@ SZ_DYNAMIC sz_u64_t sz_bytesum(sz_cptr_t text, sz_size_t length); * It passes the SMHasher suite by Austin Appleby with no collisions, even with `--extra` flag. * @see HASH.md for a detailed explanation of the algorithm. * - * @param text String to hash. - * @param length Number of bytes in the text. - * @param seed 64-bit unsigned seed for the hash. - * @return 64-bit hash value. + * @param[in] text String to hash. + * @param[in] length Number of bytes in the text. + * @param[in] seed 64-bit unsigned seed for the hash. + * @return 64-bit hash value. + * + * Example usage: + * + * @code{.c} + * #include + * int main() { + * return sz_hash("hello", 5, 0) != sz_hash("world", 5, 0) ? 0 : 1; + * } + * @endcode + * + * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. + * @sa sz_hash_serial, sz_hash_haswell, sz_hash_skylake, sz_hash_ice, sz_hash_neon + * + * @note The algorithm must provide the same output on all platforms in both single-shot and incremental modes. 
+ * @sa sz_hash_state_init, sz_hash_state_stream, sz_hash_state_fold */ SZ_DYNAMIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length, sz_u64_t seed); @@ -98,9 +125,24 @@ SZ_DYNAMIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length, sz_u64_t seed); * In this case, it doesn't apply, as we only use one round of AES mixing. We also don't expose a separate "key", * only a "nonce", to keep the API simple. * - * @param text Output string buffer to be populated. - * @param length Number of bytes in the string. - * @param nonce "Number used ONCE" to ensure uniqueness of produced blocks. + * @param[out] text Output string buffer to be populated. + * @param[in] length Number of bytes in the string. + * @param[in] nonce "Number used ONCE" to ensure uniqueness of produced blocks. + * + * Example usage: + * + * @code{.c} + * #include + * int main() { + * char first_buffer[5], second_buffer[5]; + * sz_generate(first_buffer, 5, 0); + * sz_generate(second_buffer, 5, 0); //? Same nonce will produce the same output + * return sz_bytesum(first_buffer, 5) == sz_bytesum(second_buffer, 5) ? 0 : 1; + * } + * @endcode + * + * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. + * @sa sz_generate_serial, sz_generate_haswell, sz_generate_skylake, sz_generate_ice, sz_generate_neon */ SZ_DYNAMIC void sz_generate(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); @@ -126,25 +168,25 @@ typedef struct _sz_hash_minimal_t { /** * @brief Initializes the state for incremental construction of a hash. * - * @param state The state to initialize. - * @param seed The 64-bit unsigned seed for the hash. + * @param[out] state The state to initialize. + * @param[in] seed The 64-bit unsigned seed for the hash. */ SZ_DYNAMIC void sz_hash_state_init(sz_hash_state_t *state, sz_u64_t seed); /** * @brief Updates the state with new data. * - * @param state The state to stream. - * @param text The new data to include in the hash. - * @param length The number of bytes in the new data. + * @param[inout] state The state to stream. + * @param[in] text The new data to include in the hash. + * @param[in] length The number of bytes in the new data. */ SZ_DYNAMIC void sz_hash_state_stream(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length); /** - * @brief Finalizes the state and returns the hash. + * @brief Finalizes the immutable state and returns the hash. * - * @param state The state to fold. - * @return The 64-bit hash value. + * @param[in] state The state to fold. + * @return The 64-bit hash value. 
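 *
 * An illustrative usage sketch, assuming the umbrella header; per the `sz_hash` note above, the folded
 * value is expected to match the single-shot hash of the concatenated input:
 *
 * @code{.c}
 *     #include <stringzilla/stringzilla.h>
 *     int main() {
 *         sz_hash_state_t state;
 *         sz_hash_state_init(&state, 42);
 *         sz_hash_state_stream(&state, "hello ", 6);
 *         sz_hash_state_stream(&state, "world", 5);
 *         return sz_hash_state_fold(&state) == sz_hash("hello world", 11, 42) ? 0 : 1;
 *     }
 * @endcode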
*/ SZ_DYNAMIC sz_u64_t sz_hash_state_fold(sz_hash_state_t const *state); @@ -365,8 +407,182 @@ SZ_PUBLIC sz_u64_t sz_bytesum_haswell(sz_cptr_t text, sz_size_t length) { } } -SZ_PUBLIC sz_u64_t sz_hash_haswell(sz_cptr_t text, sz_size_t length, sz_u64_t seed) { - return sz_hash_serial(text, length, seed); +SZ_INTERNAL void _sz_hash_minimal_init_haswell(_sz_hash_minimal_t *state, sz_u64_t seed) { + __m128i seed_vec = _mm_set1_epi64x(seed); + __m128i pi0 = _mm_set_epi64x(0x13198a2e03707344ull, 0x243f6a8885a308d3ull); + __m128i pi1 = _mm_set_epi64x(0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull); + // XOR the user-supplied keys with the two "pi" constants + __m128i k1 = _mm_xor_si128(seed_vec, pi0); + __m128i k2 = _mm_xor_si128(seed_vec, pi1); + // Export the keys to the state + state->aes.xmm = k1; + state->sum.xmm = k2; + state->key.xmm = _mm_xor_si128(pi0, pi1); +} + +SZ_INTERNAL sz_u64_t _sz_hash_minimal_finalize_haswell(_sz_hash_minimal_t const *state) { + // Combine the sum and the AES block + __m128i mixed_registers = _mm_aesenc_si128(state->sum.xmm, state->aes.xmm); + // Make sure the "key" mixes enough with the state, + // as with less than 2 rounds - SMHasher fails + __m128i mixed_within_register = + _mm_aesdec_si128(_mm_aesdec_si128(mixed_registers, state->key.xmm), mixed_registers); + // Extract the low 64 bits + return _mm_cvtsi128_si64(mixed_within_register); +} + +SZ_INTERNAL void _sz_hash_minimal_update_haswell(_sz_hash_minimal_t *state, __m128i block) { + // This shuffle mask is identical to "aHash": + __m128i const shuffle_mask = _mm_set_epi8( // + 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // + 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02); + state->aes.xmm = _mm_aesdec_si128(state->aes.xmm, block); + state->sum.xmm = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmm, shuffle_mask), block); +} + +SZ_PUBLIC void sz_hash_state_init_haswell(sz_hash_state_t *state, sz_u64_t seed) { + __m128i seed_vec = _mm_set1_epi64x(seed); + __m128i pi0 = _mm_set_epi64x(0x13198a2e03707344ull, 0x243f6a8885a308d3ull); + __m128i pi1 = _mm_set_epi64x(0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull); + // XOR the user-supplied keys with the two "pi" constants + __m128i k1 = _mm_xor_si128(seed_vec, pi0); + __m128i k2 = _mm_xor_si128(seed_vec, pi1); + // Export the keys to the state + state->aes.xmms[0] = state->aes.xmms[1] = state->aes.xmms[2] = state->aes.xmms[3] = k1; + state->sum.xmms[0] = state->sum.xmms[1] = state->sum.xmms[2] = state->sum.xmms[3] = k2; + state->key.xmms[0] = state->key.xmms[1] = state->key.xmms[2] = state->key.xmms[3] = _mm_xor_si128(pi0, pi1); + state->ins_length = 0; +} + +SZ_INTERNAL void _sz_hash_state_update_haswell(sz_hash_state_t *state, __m128i block0, __m128i block1, __m128i block2, + __m128i block3) { + // This shuffle mask is identical to "aHash": + __m128i const shuffle_mask = _mm_set_epi8( // + 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // + 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02); + state->aes.xmms[0] = _mm_aesdec_si128(state->aes.xmms[0], block0); + state->sum.xmms[0] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[0], shuffle_mask), block0); + state->aes.xmms[1] = _mm_aesdec_si128(state->aes.xmms[1], block1); + state->sum.xmms[1] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[1], shuffle_mask), block1); + state->aes.xmms[2] = _mm_aesdec_si128(state->aes.xmms[2], block2); + state->sum.xmms[2] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[2], shuffle_mask), block2); + state->aes.xmms[3] = _mm_aesdec_si128(state->aes.xmms[3], block3); + 
state->sum.xmms[3] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[3], shuffle_mask), block3); +} + +SZ_INTERNAL sz_u64_t _sz_hash_state_finalize_haswell(sz_hash_state_t const *state) { + // Combine the sum and the AES block + __m128i mixed_registers0 = _mm_aesenc_si128(state->sum.xmms[0], state->aes.xmms[0]); + __m128i mixed_registers1 = _mm_aesenc_si128(state->sum.xmms[1], state->aes.xmms[1]); + __m128i mixed_registers2 = _mm_aesenc_si128(state->sum.xmms[2], state->aes.xmms[2]); + __m128i mixed_registers3 = _mm_aesenc_si128(state->sum.xmms[3], state->aes.xmms[3]); + // Combine the mixed registers + __m128i mixed_registers01 = _mm_aesenc_si128(mixed_registers0, mixed_registers1); + __m128i mixed_registers23 = _mm_aesenc_si128(mixed_registers2, mixed_registers3); + __m128i mixed_registers = _mm_aesenc_si128(mixed_registers01, mixed_registers23); + // Make sure the "key" mixes enough with the state, + // as with less than 2 rounds - SMHasher fails + __m128i mixed_within_register = _mm_aesdec_si128( // + _mm_aesdec_si128(mixed_registers, state->key.xmms[0]), mixed_registers); + // Extract the low 64 bits + return _mm_cvtsi128_si64(mixed_within_register); +} + +SZ_PUBLIC sz_u64_t sz_hash_haswell(sz_cptr_t start, sz_size_t length, sz_u64_t seed) { + + if (length <= 16) { + // Initialize the AES block with a given seed and update with the input length + _sz_hash_minimal_t state; + _sz_hash_minimal_init_haswell(&state, seed); + state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); + // Load the data and update the state + sz_u128_vec_t data_vec; + data_vec.xmm = _mm_setzero_si128(); + for (sz_size_t i = 0; i < length; ++i) data_vec.u8s[i] = start[i]; + _sz_hash_minimal_update_haswell(&state, data_vec.xmm); + return _sz_hash_minimal_finalize_haswell(&state); + } + else if (length <= 32) { + // Initialize the AES block with a given seed and update with the input length + _sz_hash_minimal_t state; + _sz_hash_minimal_init_haswell(&state, seed); + state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); + // Load the data and update the state + sz_u128_vec_t data0_vec, data1_vec; + data0_vec.xmm = _mm_lddqu_si128(start); + data1_vec.xmm = _mm_lddqu_si128(start + length - 16); + // Let's shift the data within the register to de-interleave the bytes. + data1_vec.xmm = _mm_bsrli_si128(data1_vec.xmm, 32 - length); + _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); + return _sz_hash_minimal_finalize_haswell(&state); + } + else if (length <= 48) { + // Initialize the AES block with a given seed and update with the input length + _sz_hash_minimal_t state; + _sz_hash_minimal_init_haswell(&state, seed); + state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); + // Load the data and update the state + sz_u128_vec_t data0_vec, data1_vec, data2_vec; + data0_vec.xmm = _mm_lddqu_si128(start); + data1_vec.xmm = _mm_lddqu_si128(start + 16); + data2_vec.xmm = _mm_lddqu_si128(start + length - 16); + // Let's shift the data within the register to de-interleave the bytes. 
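// The third load starts at `start + length - 16`, so its lowest `48 - length` bytes duplicate data
// already covered by the previous 16-byte block; shifting right by that amount drops the duplicates.
// For a 40-byte input, for example, the load covers bytes [24, 40) and its first 8 bytes are discarded.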
+ data2_vec.xmm = _mm_bsrli_si128(data2_vec.xmm, 48 - length); + _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data2_vec.xmm); + return _sz_hash_minimal_finalize_haswell(&state); + } + else if (length <= 64) { + // Initialize the AES block with a given seed and update with the input length + _sz_hash_minimal_t state; + _sz_hash_minimal_init_haswell(&state, seed); + state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); + // Load the data and update the state + sz_u128_vec_t data0_vec, data1_vec, data2_vec, data3_vec; + data0_vec.xmm = _mm_lddqu_si128(start); + data1_vec.xmm = _mm_lddqu_si128(start + 16); + data2_vec.xmm = _mm_lddqu_si128(start + 32); + data3_vec.xmm = _mm_lddqu_si128(start + length - 16); + // Let's shift the data within the register to de-interleave the bytes. + data3_vec.xmm = _mm_bsrli_si128(data3_vec.xmm, 64 - length); + _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data2_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data3_vec.xmm); + return _sz_hash_minimal_finalize_haswell(&state); + } + else { + // Use a larger state to handle the main loop and add different offsets + // to different lanes of the register + sz_hash_state_t state; + sz_hash_state_init_haswell(&state, seed); + state.aes.xmms[0] = _mm_add_epi64(state.aes.xmms[0], _mm_set_epi64x(0, length)); + state.aes.xmms[1] = _mm_add_epi64(state.aes.xmms[1], _mm_set_epi64x(16, length)); + state.aes.xmms[2] = _mm_add_epi64(state.aes.xmms[2], _mm_set_epi64x(32, length)); + state.aes.xmms[3] = _mm_add_epi64(state.aes.xmms[3], _mm_set_epi64x(48, length)); + + for (; state.ins_length + 64 <= length; state.ins_length += 64) { + state.ins.xmms[0] = _mm_lddqu_si128(start + state.ins_length); + state.ins.xmms[1] = _mm_lddqu_si128(start + state.ins_length + 16); + state.ins.xmms[2] = _mm_lddqu_si128(start + state.ins_length + 32); + state.ins.xmms[3] = _mm_lddqu_si128(start + state.ins_length + 48); + _sz_hash_state_update_haswell(&state, state.ins.xmms[0], state.ins.xmms[1], state.ins.xmms[2], + state.ins.xmms[3]); + } + if (state.ins_length < length) { + state.ins.xmms[0] = _mm_setzero_si128(); + state.ins.xmms[1] = _mm_setzero_si128(); + state.ins.xmms[2] = _mm_setzero_si128(); + state.ins.xmms[3] = _mm_setzero_si128(); + for (sz_size_t i = 0; state.ins_length < length; ++i, ++state.ins_length) + state.ins.u8s[i] = start[state.ins_length]; + _sz_hash_state_update_haswell(&state, state.ins.xmms[0], state.ins.xmms[1], state.ins.xmms[2], + state.ins.xmms[3]); + } + return _sz_hash_state_finalize_haswell(&state); + } } SZ_PUBLIC void sz_generate_haswell(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { @@ -499,16 +715,107 @@ SZ_PUBLIC sz_u64_t sz_bytesum_skylake(sz_cptr_t text, sz_size_t length) { } } -SZ_PUBLIC sz_u64_t sz_hash_skylake(sz_cptr_t text, sz_size_t length, sz_u64_t seed) { - return sz_hash_serial(text, length, seed); +SZ_PUBLIC void sz_hash_state_init_skylake(sz_hash_state_t *state, sz_u64_t seed) { + __m512i seed_vec = _mm512_set1_epi64(seed); + __m512i pi0 = _mm512_set_epi64( // + 0x13198a2e03707344ull, 0x243f6a8885a308d3ull, 0x13198a2e03707344ull, 0x243f6a8885a308d3ull, + 0x13198a2e03707344ull, 0x243f6a8885a308d3ull, 0x13198a2e03707344ull, 0x243f6a8885a308d3ull); + __m512i pi1 = _mm512_set_epi64( // + 0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull, 0x082efa98ec4e6c89ull, 
0xa4093822299f31d0ull, + 0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull, 0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull); + // XOR the user-supplied keys with the two "pi" constants + __m512i k1 = _mm512_xor_si512(seed_vec, pi0); + __m512i k2 = _mm512_xor_si512(seed_vec, pi1); + // Export the keys to the state + state->aes.zmm = k1; + state->sum.zmm = k2; + state->key.zmm = _mm512_xor_si512(pi0, pi1); + state->ins_length = 0; } -SZ_PUBLIC void sz_generate_skylake(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { - sz_generate_serial(text, length, nonce); +SZ_PUBLIC sz_u64_t sz_hash_skylake(sz_cptr_t start, sz_size_t length, sz_u64_t seed) { + + if (length <= 16) { + // Initialize the AES block with a given seed and update with the input length + _sz_hash_minimal_t state; + _sz_hash_minimal_init_haswell(&state, seed); + state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); + // Load the data and update the state + sz_u128_vec_t data_vec; + data_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length), start); + _sz_hash_minimal_update_haswell(&state, data_vec.xmm); + return _sz_hash_minimal_finalize_haswell(&state); + } + else if (length <= 32) { + // Initialize the AES block with a given seed and update with the input length + _sz_hash_minimal_t state; + _sz_hash_minimal_init_haswell(&state, seed); + state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); + // Load the data and update the state + sz_u128_vec_t data0_vec, data1_vec; + data0_vec.xmm = _mm_lddqu_si128(start); + data1_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length - 16), start + 16); + _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); + return _sz_hash_minimal_finalize_haswell(&state); + } + else if (length <= 48) { + // Initialize the AES block with a given seed and update with the input length + _sz_hash_minimal_t state; + _sz_hash_minimal_init_haswell(&state, seed); + state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); + // Load the data and update the state + sz_u128_vec_t data0_vec, data1_vec, data2_vec; + data0_vec.xmm = _mm_lddqu_si128(start); + data1_vec.xmm = _mm_lddqu_si128(start + 16); + data2_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length - 32), start + 32); + _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data2_vec.xmm); + return _sz_hash_minimal_finalize_haswell(&state); + } + else if (length <= 64) { + // Initialize the AES block with a given seed and update with the input length + _sz_hash_minimal_t state; + _sz_hash_minimal_init_haswell(&state, seed); + state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); + // Load the data and update the state + sz_u128_vec_t data0_vec, data1_vec, data2_vec, data3_vec; + data0_vec.xmm = _mm_lddqu_si128(start); + data1_vec.xmm = _mm_lddqu_si128(start + 16); + data2_vec.xmm = _mm_lddqu_si128(start + 32); + data3_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length - 48), start + 48); + _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data2_vec.xmm); + _sz_hash_minimal_update_haswell(&state, data3_vec.xmm); + return _sz_hash_minimal_finalize_haswell(&state); + } + else { + // Use a larger state to handle the main loop and add different offsets + // to different lanes of the register + sz_hash_state_t state; 
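// Folding each 128-bit lane's starting offset (0, 16, 32, 48) together with the total length into the
// `aes` lanes below presumably keeps identical 16-byte blocks at different positions from contributing
// identically; the Haswell and Ice Lake variants apply the same per-lane offsets.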
+ sz_hash_state_init_skylake(&state, seed); + state.aes.zmm = _mm512_add_epi64( // + state.aes.zmm, // + _mm512_set_epi64(0, length, 16, length, 32, length, 48, length)); + + for (; state.ins_length + 64 <= length; state.ins_length += 64) { + state.ins.zmm = _mm512_loadu_epi8(start + state.ins_length); + _sz_hash_state_update_haswell(&state, state.ins.xmms[0], state.ins.xmms[1], state.ins.xmms[2], + state.ins.xmms[3]); + } + if (state.ins_length < length) { + state.ins.zmm = _mm512_maskz_loadu_epi8( // + _sz_u64_mask_until(length - state.ins_length), start + state.ins_length); + _sz_hash_state_update_skylake(&state, state.ins.zmm); + } + return _sz_hash_state_finalize_haswell(&state); + } } -SZ_PUBLIC void sz_hash_state_init_skylake(sz_hash_state_t *state, sz_u64_t seed) { - sz_hash_state_init_serial(state, seed); +SZ_PUBLIC void sz_generate_skylake(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { + sz_generate_serial(text, length, nonce); } SZ_PUBLIC void sz_hash_state_stream_skylake(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { @@ -663,57 +970,6 @@ SZ_PUBLIC sz_u64_t sz_bytesum_ice(sz_cptr_t text, sz_size_t length) { } } -SZ_INTERNAL void _sz_hash_minimal_init_haswell(_sz_hash_minimal_t *state, sz_u64_t seed) { - __m128i seed_vec = _mm_set1_epi64x(seed); - __m128i pi0 = _mm_set_epi64x(0x13198a2e03707344ull, 0x243f6a8885a308d3ull); - __m128i pi1 = _mm_set_epi64x(0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull); - // XOR the user-supplied keys with the two "pi" constants - __m128i k1 = _mm_xor_si128(seed_vec, pi0); - __m128i k2 = _mm_xor_si128(seed_vec, pi1); - // Export the keys to the state - state->aes.xmm = k1; - state->sum.xmm = k2; - state->key.xmm = _mm_xor_si128(pi0, pi1); -} - -SZ_INTERNAL sz_u64_t _sz_hash_minimal_finalize_haswell(_sz_hash_minimal_t const *state) { - // Combine the sum and the AES block - __m128i mixed_registers = _mm_aesenc_si128(state->sum.xmm, state->aes.xmm); - // Make sure the "key" mixes enough with the state, - // as with less than 2 rounds - SMHasher fails - __m128i mixed_within_register = - _mm_aesdec_si128(_mm_aesdec_si128(mixed_registers, state->key.xmm), mixed_registers); - // Extract the low 64 bits - return _mm_cvtsi128_si64(mixed_within_register); -} - -SZ_INTERNAL void _sz_hash_minimal_update_haswell(_sz_hash_minimal_t *state, __m128i block) { - // This shuffle mask is identical to "aHash": - __m128i const shuffle_mask = _mm_set_epi8( // - 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // - 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02); - state->aes.xmm = _mm_aesdec_si128(state->aes.xmm, block); - state->sum.xmm = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmm, shuffle_mask), block); -} - -SZ_PUBLIC void sz_hash_state_init_ice(sz_hash_state_t *state, sz_u64_t seed) { - __m512i seed_vec = _mm512_set1_epi64(seed); - __m512i pi0 = _mm512_set_epi64( // - 0x13198a2e03707344ull, 0x243f6a8885a308d3ull, 0x13198a2e03707344ull, 0x243f6a8885a308d3ull, - 0x13198a2e03707344ull, 0x243f6a8885a308d3ull, 0x13198a2e03707344ull, 0x243f6a8885a308d3ull); - __m512i pi1 = _mm512_set_epi64( // - 0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull, 0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull, - 0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull, 0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull); - // XOR the user-supplied keys with the two "pi" constants - __m512i k1 = _mm512_xor_si512(seed_vec, pi0); - __m512i k2 = _mm512_xor_si512(seed_vec, pi1); - // Export the keys to the state - state->aes.zmm = k1; - state->sum.zmm = k2; - state->key.zmm = _mm512_xor_si512(pi0, pi1); 
- state->ins_length = 0; -} - SZ_INTERNAL void _sz_hash_state_update_ice(sz_hash_state_t *state, __m512i block) { // This shuffle mask is identical to "aHash": __m512i const shuffle_mask = _mm512_set_epi8( // @@ -730,24 +986,6 @@ SZ_INTERNAL void _sz_hash_state_update_ice(sz_hash_state_t *state, __m512i block state->sum.zmm = _mm512_add_epi64(_mm512_shuffle_epi8(state->sum.zmm, shuffle_mask), block); } -SZ_INTERNAL sz_u64_t _sz_hash_state_finalize_ice(sz_hash_state_t const *state) { - // Combine the sum and the AES block - __m128i mixed_registers0 = _mm_aesenc_si128(state->sum.xmms[0], state->aes.xmms[0]); - __m128i mixed_registers1 = _mm_aesenc_si128(state->sum.xmms[1], state->aes.xmms[1]); - __m128i mixed_registers2 = _mm_aesenc_si128(state->sum.xmms[2], state->aes.xmms[2]); - __m128i mixed_registers3 = _mm_aesenc_si128(state->sum.xmms[3], state->aes.xmms[3]); - // Combine the mixed registers - __m128i mixed_registers01 = _mm_aesenc_si128(mixed_registers0, mixed_registers1); - __m128i mixed_registers23 = _mm_aesenc_si128(mixed_registers2, mixed_registers3); - __m128i mixed_registers = _mm_aesenc_si128(mixed_registers01, mixed_registers23); - // Make sure the "key" mixes enough with the state, - // as with less than 2 rounds - SMHasher fails - __m128i mixed_within_register = _mm_aesdec_si128( // - _mm_aesdec_si128(mixed_registers, state->key.xmms[0]), mixed_registers); - // Extract the low 64 bits - return _mm_cvtsi128_si64(mixed_within_register); -} - SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed) { if (length <= 16) { @@ -768,7 +1006,7 @@ SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed) state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec; - data0_vec.xmm = _mm_loadu_epi8(start); + data0_vec.xmm = _mm_lddqu_si128(start); data1_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length - 16), start + 16); _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); @@ -781,8 +1019,8 @@ SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed) state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec, data2_vec; - data0_vec.xmm = _mm_loadu_epi8(start); - data1_vec.xmm = _mm_loadu_epi8(start + 16); + data0_vec.xmm = _mm_lddqu_si128(start); + data1_vec.xmm = _mm_lddqu_si128(start + 16); data2_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length - 32), start + 32); _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); @@ -796,9 +1034,9 @@ SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed) state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec, data2_vec, data3_vec; - data0_vec.xmm = _mm_loadu_epi8(start); - data1_vec.xmm = _mm_loadu_epi8(start + 16); - data2_vec.xmm = _mm_loadu_epi8(start + 32); + data0_vec.xmm = _mm_lddqu_si128(start); + data1_vec.xmm = _mm_lddqu_si128(start + 16); + data2_vec.xmm = _mm_lddqu_si128(start + 32); data3_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length - 48), start + 48); _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); @@ -810,7 +1048,7 @@ SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t 
length, sz_u64_t seed) // Use a larger state to handle the main loop and add different offsets // to different lanes of the register sz_hash_state_t state; - sz_hash_state_init_ice(&state, seed); + sz_hash_state_init_skylake(&state, seed); state.aes.zmm = _mm512_add_epi64( // state.aes.zmm, // _mm512_set_epi64(0, length, 16, length, 32, length, 48, length)); @@ -824,7 +1062,7 @@ SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed) _sz_u64_mask_until(length - state.ins_length), start + state.ins_length); _sz_hash_state_update_ice(&state, state.ins.zmm); } - return _sz_hash_state_finalize_ice(&state); + return _sz_hash_state_finalize_haswell(&state); } } From 69dfa10cc9ca9e9f8c65876369223a82fb91d7f7 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 23 Feb 2025 20:20:24 +0000 Subject: [PATCH 126/751] Improve: `copy`/`move` on Haswell with interleaving --- include/stringzilla/memory.h | 91 ++++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 4 deletions(-) diff --git a/include/stringzilla/memory.h b/include/stringzilla/memory.h index cc5cc6d7..79cd840c 100644 --- a/include/stringzilla/memory.h +++ b/include/stringzilla/memory.h @@ -439,7 +439,44 @@ SZ_PUBLIC void sz_copy_haswell(sz_ptr_t target, sz_cptr_t source, sz_size_t leng // 1 MB x 2 blocks of L2 cache per core, and one shared L3 cache buffer. // For now, let's avoid the cases beyond the L2 size. int is_huge = length > 1ull * 1024ull * 1024ull; - if (length <= 32) { sz_copy_serial(target, source, length); } + if (length < 8) { + while (length--) *(target++) = *(source++); + } + // The next few sections are identical here and in the `sz_move_haswell` function. + // We can use 2x 64-bit interleaving loads for each string, and then compare them for equality. + // The same approach is used in GLibC and was suggest by Denis Yaroshevskiy. + // https://codebrowser.dev/glibc/glibc/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S.html#518 + // It shouldn't improve performance on microbenchmarks, but should be better in practice. + else if (length <= 16) { + sz_u64_t source_first_word = *(sz_u64_t const *)(source); + sz_u64_t source_second_word = *(sz_u64_t const *)(source + length - 8); + sz_u64_t *target_first_word_ptr = (sz_u64_t *)(target); + sz_u64_t *target_second_word_ptr = (sz_u64_t *)(target + length - 8); + *target_first_word_ptr = source_first_word; + *target_second_word_ptr = source_second_word; + } + // We can use 2x 128-bit interleaving loads for each string, and then compare them for equality. + else if (length <= 32) { + sz_u128_vec_t source_first_vec, source_second_vec; + sz_u128_vec_t *target_first_word_ptr, *target_second_word_ptr; + source_first_vec.xmm = _mm_lddqu_si128((__m128i const *)(source)); + source_second_vec.xmm = _mm_lddqu_si128((__m128i const *)(source + length - 16)); + target_first_word_ptr = (sz_u128_vec_t *)(target); + target_second_word_ptr = (sz_u128_vec_t *)(target + length - 16); + _mm_storeu_si128(&target_first_word_ptr->xmm, source_first_vec.xmm); + _mm_storeu_si128(&target_second_word_ptr->xmm, source_second_vec.xmm); + } + // We can use 2x 256-bit interleaving loads for each string, and then compare them for equality. 
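+    // All of these small branches rely on the same trick: one block is loaded from the front of
+    // the buffer and one block ends exactly at `length`, so the two may overlap. The overlapping
+    // bytes are simply written twice with identical contents - e.g. for `length == 11` the two
+    // 64-bit words above cover bytes [0, 8) and [3, 11) - which avoids any per-byte tail loop.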
+ else if (length <= 64) { + sz_u256_vec_t source_first_vec, source_second_vec; + sz_u256_vec_t *target_first_word_ptr, *target_second_word_ptr; + source_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(source)); + source_second_vec.ymm = _mm256_lddqu_si256((__m256i const *)(source + length - 32)); + target_first_word_ptr = (sz_u256_vec_t *)(target); + target_second_word_ptr = (sz_u256_vec_t *)(target + length - 32); + _mm256_storeu_si256(&target_first_word_ptr->ymm, source_first_vec.ymm); + _mm256_storeu_si256(&target_second_word_ptr->ymm, source_second_vec.ymm); + } // When dealing with larger arrays, the optimization is not as simple as with the `sz_fill_haswell` function, // as both buffers may be unaligned. If we are lucky and the requested operation is some huge page transfer, // we can use aligned loads and stores, and the performance will be great. @@ -471,7 +508,7 @@ SZ_PUBLIC void sz_copy_haswell(sz_ptr_t target, sz_cptr_t source, sz_size_t leng for (; body_length >= 32; target += 32, source += 32, body_length -= 32) _mm256_store_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); } - // When the biffer is huge, we can traverse it in 2 directions. + // When the buffer is huge, we can traverse it in 2 directions. else { for (; body_length >= 64; target += 32, source += 32, body_length -= 64) { _mm256_store_si256((__m256i *)(target), _mm256_lddqu_si256((__m256i const *)(source))); @@ -494,13 +531,59 @@ SZ_PUBLIC void sz_copy_haswell(sz_ptr_t target, sz_cptr_t source, sz_size_t leng } SZ_PUBLIC void sz_move_haswell(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { - if (target < source || target >= source + length) { + + if (length < 8) { + if (target < source) + while (length--) *(target++) = *(source++); + else { + // Jump to the end and walk backwards: + target += length, source += length; + while (length--) *(--target) = *(--source); + } + } + // The next few sections are identical here and in the `sz_copy_haswell` function. + // We can use 2x 64-bit interleaving loads for each string, and then compare them for equality. + // The same approach is used in GLibC and was suggest by Denis Yaroshevskiy. + // https://codebrowser.dev/glibc/glibc/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S.html#518 + // It shouldn't improve performance on microbenchmarks, but should be better in practice. + else if (length <= 16) { + sz_u64_t source_first_word = *(sz_u64_t const *)(source); + sz_u64_t source_second_word = *(sz_u64_t const *)(source + length - 8); + sz_u64_t *target_first_word_ptr = (sz_u64_t *)(target); + sz_u64_t *target_second_word_ptr = (sz_u64_t *)(target + length - 8); + *target_first_word_ptr = source_first_word; + *target_second_word_ptr = source_second_word; + } + // We can use 2x 128-bit interleaving loads for each string, and then compare them for equality. + else if (length <= 32) { + sz_u128_vec_t source_first_vec, source_second_vec; + sz_u128_vec_t *target_first_word_ptr, *target_second_word_ptr; + source_first_vec.xmm = _mm_lddqu_si128((__m128i const *)(source)); + source_second_vec.xmm = _mm_lddqu_si128((__m128i const *)(source + length - 16)); + target_first_word_ptr = (sz_u128_vec_t *)(target); + target_second_word_ptr = (sz_u128_vec_t *)(target + length - 16); + _mm_storeu_si128(&target_first_word_ptr->xmm, source_first_vec.xmm); + _mm_storeu_si128(&target_second_word_ptr->xmm, source_second_vec.xmm); + } + // We can use 2x 256-bit interleaving loads for each string, and then compare them for equality. 
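+    // These small branches are shared verbatim with `sz_copy_haswell` and need no direction
+    // check here: both loads complete before either store is issued, so the result stays
+    // correct even when `source` and `target` overlap in either direction.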
+ else if (length <= 64) { + sz_u256_vec_t source_first_vec, source_second_vec; + sz_u256_vec_t *target_first_word_ptr, *target_second_word_ptr; + source_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(source)); + source_second_vec.ymm = _mm256_lddqu_si256((__m256i const *)(source + length - 32)); + target_first_word_ptr = (sz_u256_vec_t *)(target); + target_second_word_ptr = (sz_u256_vec_t *)(target + length - 32); + _mm256_storeu_si256(&target_first_word_ptr->ymm, source_first_vec.ymm); + _mm256_storeu_si256(&target_second_word_ptr->ymm, source_second_vec.ymm); + } + // When dealing with larger arrays, we keep things simple: + else if (target < source || target >= source + length) { for (; length >= 32; target += 32, source += 32, length -= 32) _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source)); while (length--) *(target++) = *(source++); } else { - // Jump to the end and walk backwards. + // Jump to the end and walk backwards: for (target += length, source += length; length >= 32; length -= 32) _mm256_storeu_si256((__m256i *)(target -= 32), _mm256_lddqu_si256((__m256i const *)(source -= 32))); while (length--) *(--target) = *(--source); From 268af531e29a0077272c3bf023944c2c92ec4ccc Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 25 Feb 2025 23:39:26 +0000 Subject: [PATCH 127/751] Fix: Passing new hashing tests --- include/stringzilla/hash.h | 781 ++++++++++++++++++++++++++++--------- 1 file changed, 605 insertions(+), 176 deletions(-) diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index 539f016a..105ffabe 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -36,7 +36,7 @@ * are combined with "shuffle & add" instructions to provide a high level of entropy in the output. That operation * is practically free, as many modern CPUs will dispatch them on different ports. On x86, for example: * - * - `VAESDEC` (ZMM, ZMM, ZMM)`: + * - `VAESENC` (ZMM, ZMM, ZMM)`: * - on Intel Ice Lake: 5 cycles on port 0. * - On AMD Zen4: 4 cycles on ports 0 or 1. * - `VPSHUFB_Z (ZMM, K, ZMM, ZMM)` @@ -46,10 +46,19 @@ * - on Intel Ice Lake: 1 cycle on ports 0 or 5. * - On AMD Zen4: 1 cycle on ports 0, 1, 2, 3. * - * Unlike "aHash", on long inputs, we use a procedure that is more vector-friendly on modern servers. + * Unlike "aHash", the length is not mixed into "AES" block at start to allow incremental construction. + * Unlike "aHash", on long inputs, we use a heavier procedure that is more vector-friendly on modern servers. * Unlike "aHash", we don't load interleaved memory regions, making vectorized variant more similar to sequential. - * On platforms like Skylake-X or newer, we also benefit from masked loads. + * Unlike "aHash", on platforms like Intel Skylake-X or AWS Graviton 3, we use masked loads. + * Unlike "aHash", in final folding procedure, we use the same `VAESENC` instead of `VAESDEC`, which + * still provides the same level of mixing, but allows us to have a lighter serial fallback implementation. 
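+ *
+ * The streaming interface buffers input in 64-byte blocks, so hashing a string in chunks is
+ * designed to produce the same value as hashing it in one call. A minimal usage sketch, assuming
+ * the dispatched `sz_hash`, `sz_hash_state_init`, `sz_hash_state_stream`, and `sz_hash_state_fold`
+ * entry points that mirror the per-ISA variants declared in this header:
+ *
+ * @code{.c}
+ *     sz_u64_t one_shot = sz_hash("hello world", 11, 42);
+ *     sz_hash_state_t state;
+ *     sz_hash_state_init(&state, 42);
+ *     sz_hash_state_stream(&state, "hello ", 6);
+ *     sz_hash_state_stream(&state, "world", 5);
+ *     sz_u64_t streamed = sz_hash_state_fold(&state); // Expected to match `one_shot`
+ * @endcode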
* + * @see Reini Urban's more active fork of SMHasher by Austin Appleby: https://github.com/rurban/smhasher + * @see The serial AES routines are based on Morten Jensen's "tiny-AES-c": https://github.com/kokke/tiny-AES-c + * @see The "xxHash" C implementation by Yann Collet: https://github.com/Cyan4973/xxHash + * @see The "aHash" Rust implementation by Tom Kaitchuck: https://github.com/tkaitchuck/aHash + * @see "Emulating x86 AES Intrinsics on ARMv8-A" by Michael Brase: + * https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a/ */ #ifndef STRINGZILLA_HASH_H_ #define STRINGZILLA_HASH_H_ @@ -153,9 +162,8 @@ SZ_DYNAMIC void sz_generate(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); typedef struct sz_hash_state_t { sz_u512_vec_t aes; sz_u512_vec_t sum; - sz_u512_vec_t key; - sz_u512_vec_t ins; + sz_u128_vec_t key; sz_size_t ins_length; } sz_hash_state_t; @@ -282,6 +290,24 @@ SZ_PUBLIC sz_u64_t sz_hash_state_fold_neon(sz_hash_state_t const *state); #pragma endregion // Core API +#pragma region Helper Methods + +/** + * @brief Compares the state of two running hashes. + * @note The current content of the `ins` buffer and its length is ignored. + */ +SZ_PUBLIC sz_bool_t sz_hash_state_equal(sz_hash_state_t const *lhs, sz_hash_state_t const *rhs) { + return lhs->aes.u64s[0] == rhs->aes.u64s[0] && lhs->aes.u64s[1] == rhs->aes.u64s[1] && + lhs->aes.u64s[2] == rhs->aes.u64s[2] && lhs->aes.u64s[3] == rhs->aes.u64s[3] && + lhs->sum.u64s[0] == rhs->sum.u64s[0] && lhs->sum.u64s[1] == rhs->sum.u64s[1] && + lhs->sum.u64s[2] == rhs->sum.u64s[2] && lhs->sum.u64s[3] == rhs->sum.u64s[3] && + lhs->key.u64s[0] == rhs->key.u64s[0] && lhs->key.u64s[1] == rhs->key.u64s[1] + ? sz_true_k + : sz_false_k; +} + +#pragma endregion // Helper Methods + #pragma region Serial Implementation SZ_PUBLIC sz_u64_t sz_bytesum_serial(sz_cptr_t text, sz_size_t length) { @@ -292,24 +318,392 @@ SZ_PUBLIC sz_u64_t sz_bytesum_serial(sz_cptr_t text, sz_size_t length) { return bytesum; } +/** + * @brief Emulates the behaviour of `_mm_aesenc_si128` for a single round. + * This function is used as a fallback when the hardware-accelerated version is not available. + * @return Result of `MixColumns(SubBytes(ShiftRows(state))) ^ round_key`. 
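+ * @note The `_sz_gf2_double` helper used below is the AES "xtime" primitive: multiplication by 2
+ * in GF(2^8) modulo the x^8 + x^4 + x^3 + x + 1 polynomial, i.e. a left shift followed by a
+ * conditional XOR with 0x1B when the top bit spills over.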
+ * @see Based on Jean-Philippe Aumasson's reference implementation: https://github.com/veorq/aesenc-noNI + */ +SZ_INTERNAL sz_u128_vec_t _sz_emulate_aesenc_si128_serial(sz_u128_vec_t state_vec, sz_u128_vec_t round_key_vec) { + static sz_u8_t const sbox[256] = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, // + 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, // + 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, // + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, // + 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, // + 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, // + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, // + 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, // + 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, // + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, // + 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, // + 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, // + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, // + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, // + 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, // + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}; + + // Combine `ShiftRows` and `SubBytes` + sz_u8_t state_2d[4][4]; + for (int i = 0; i < 16; ++i) state_2d[((i / 4) + 4 - (i % 4)) % 4][i % 4] = sbox[state_vec.u8s[i]]; +#define _sz_gf2_double(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b)) + // Perform `MixColumns` using GF2 multiplication by 2 + for (int i = 0; i < 4; ++i) { + sz_u8_t t = state_2d[i][0]; + sz_u8_t u = state_2d[i][0] ^ state_2d[i][1] ^ state_2d[i][2] ^ state_2d[i][3]; + state_2d[i][0] ^= u ^ _sz_gf2_double(state_2d[i][0] ^ state_2d[i][1]); + state_2d[i][1] ^= u ^ _sz_gf2_double(state_2d[i][1] ^ state_2d[i][2]); + state_2d[i][2] ^= u ^ _sz_gf2_double(state_2d[i][2] ^ state_2d[i][3]); + state_2d[i][3] ^= u ^ _sz_gf2_double(state_2d[i][3] ^ t); + } +#undef _sz_gf2_double + // Export `XOR`-ing with the round key + sz_u128_vec_t result; + for (int i = 0; i < 16; ++i) result.u8s[i] = state_2d[i / 4][i % 4] ^ round_key_vec.u8s[i]; + return result; +} + +SZ_INTERNAL sz_u128_vec_t _sz_emulate_shuffle_epi8_serial(sz_u128_vec_t state_vec, sz_u8_t const order[16]) { + sz_u128_vec_t result; + for (int i = 0; i < 16; ++i) result.u8s[i] = state_vec.u8s[order[i]]; + return result; +} + +/** + * @brief Provides 1024 bits worth of precomputed Pi constants for the hash. + * @return Pointer aligned to 64 bytes on SIMD-capable platforms. + * + * Bailey-Borwein-Plouffe @b (BBP) formula is used to compute the hexadecimal digits of Pi. 
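+ * The series itself is:
+ *
+ *     pi = sum_{k >= 0} 16^(-k) * ( 4/(8k+1) - 2/(8k+4) - 1/(8k+5) - 1/(8k+6) )
+ *
+ * and the bracketed term collapses into a single fraction, (120k^2 + 151k + 47) /
+ * (512k^4 + 1024k^3 + 712k^2 + 194k + 15), which is exactly the `xn / xd` ratio used in the
+ * snippet below.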
+ * It can be easily implemented in just 10 lines of Python and for 1024 bits requires 256 digits: + * + * @code{.py} + * def pi(digits: int) -> str: + * n, d = 0, 1 + * HEX = "0123456789ABCDEF" + * result = ["3."] + * for i in range(digits): + * xn = 120 * i**2 + 151 * i + 47 + * xd = 512 * i**4 + 1024 * i**3 + 712 * i**2 + 194 * i + 15 + * n = ((16 * n * xd) + (xn * d)) % (d * xd) + * d *= xd + * result.append(HEX[(16 * n) // d]) + * return "".join(result) + * @endcode + * + * For `pi(16)` the result is `3.243F6A8885A308D3` and you can find the digits after the dot in + * the first element of output array. + * + * @see Bailey-Borwein-Plouffe @b (BBP) formula explanation by Mosè Giordano: + * https://giordano.github.io/blog/2017-11-21-hexadecimal-pi/ + * + */ +SZ_INTERNAL sz_u64_t const *_sz_hash_pi_constants(void) { + static _SZ_ALIGN64 sz_u64_t const pi[16] = { + 0x243F6A8885A308D3ull, 0x13198A2E03707344ull, 0xA4093822299F31D0ull, 0x082EFA98EC4E6C89ull, + 0x452821E638D01377ull, 0xBE5466CF34E90C6Cull, 0xC0AC29B7C97C50DDull, 0x3F84D5B5B5470917ull, + 0x9216D5D98979FB1Bull, 0xD1310BA698DFB5ACull, 0x2FFD72DBD01ADFB7ull, 0xB8E1AFED6A267E96ull, + 0xBA7C9045F12C7F99ull, 0x24A19947B3916CF7ull, 0x0801F2E2858EFC16ull, 0x636920D871574E69ull, + }; + return &pi[0]; +} + +/** + * @brief Provides a shuffle mask for the additive part, identical to "aHash" in a single lane. + * @return Pointer aligned to 64 bytes on SIMD-capable platforms. + */ +SZ_INTERNAL sz_u8_t const *_sz_hash_u8x16x4_shuffle(void) { + static _SZ_ALIGN64 sz_u8_t const shuffle[64] = { + 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // + 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02, // + 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // + 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02, // + 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // + 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02, // + 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // + 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02 // + }; + return &shuffle[0]; +} + +SZ_INTERNAL void _sz_hash_minimal_init_serial(_sz_hash_minimal_t *state, sz_u64_t seed) { + + // The key is made from the seed and half of it will be mixed with the length in the end + state->key.u64s[1] = seed; + state->key.u64s[0] = seed; + + // XOR the user-supplied keys with the two "pi" constants + sz_u64_t const *pi = _sz_hash_pi_constants(); + state->aes.u64s[0] = seed ^ pi[0]; + state->aes.u64s[1] = seed ^ pi[1]; + state->sum.u64s[0] = seed ^ pi[8]; + state->sum.u64s[1] = seed ^ pi[9]; +} + +SZ_INTERNAL void _sz_hash_minimal_update_serial(_sz_hash_minimal_t *state, sz_u128_vec_t block) { + sz_u8_t const *shuffle = _sz_hash_u8x16x4_shuffle(); + state->aes = _sz_emulate_aesenc_si128_serial(state->aes, block); + state->sum = _sz_emulate_shuffle_epi8_serial(state->sum, shuffle); + state->sum.u64s[0] += block.u64s[0], state->sum.u64s[1] += block.u64s[1]; +} + +SZ_INTERNAL sz_u64_t _sz_hash_minimal_finalize_serial(_sz_hash_minimal_t const *state, sz_size_t length) { + // Mix the length into the key + sz_u128_vec_t key_with_length = state->key; + key_with_length.u64s[0] += length; + // Combine the "sum" and the "AES" blocks + sz_u128_vec_t mixed_registers = _sz_emulate_aesenc_si128_serial(state->sum, state->aes); + // Make sure the "key" mixes enough with the state, + // as with less than 2 rounds - SMHasher fails + sz_u128_vec_t mixed_within_register = _sz_emulate_aesenc_si128_serial( + _sz_emulate_aesenc_si128_serial(mixed_registers, key_with_length), mixed_registers); + // Extract the low 64 bits + 
return mixed_within_register.u64s[0]; +} + +SZ_INTERNAL void _sz_hash_shift_in_register_serial(sz_u128_vec_t *vec, int shift_bytes) { + // One of the ridiculous things about x86, the `bsrli` instruction requires its operand to be an immediate. + // On GCC and Clang, we could use the provided `__int128` type, but MSVC doesn't support it. + // So we need to emulate it with 2x 64-bit shifts. + if (shift_bytes >= 8) { + vec->u64s[0] = (vec->u64s[1] >> (shift_bytes - 8) * 8); + vec->u64s[1] = (0); + } + else if (shift_bytes) { //! If `shift_bytes == 0`, the shift would cause UB. + vec->u64s[0] = (vec->u64s[0] >> shift_bytes * 8) | (vec->u64s[1] << (8 - shift_bytes) * 8); + vec->u64s[1] = (vec->u64s[1] >> shift_bytes * 8); + } +} + +SZ_PUBLIC void sz_hash_state_init_serial(sz_hash_state_t *state, sz_u64_t seed) { + + // The key is made from the seed and half of it will be mixed with the length in the end + state->key.u64s[0] = seed; + state->key.u64s[1] = seed; + + // XOR the user-supplied keys with the two "pi" constants + sz_u64_t const *pi = _sz_hash_pi_constants(); + for (int i = 0; i < 8; ++i) state->aes.u64s[i] = seed ^ pi[i]; + for (int i = 0; i < 8; ++i) state->sum.u64s[i] = seed ^ pi[i + 8]; + + // The inputs are zeroed out at the beginning + for (int i = 0; i < 8; ++i) state->ins.u64s[i] = 0; + state->ins_length = 0; +} + +SZ_INTERNAL void _sz_hash_state_update_serial(sz_hash_state_t *state) { + sz_u8_t const *shuffle = _sz_hash_u8x16x4_shuffle(); + + // To reuse the snippets above, let's cast to our familiar 128-bit vectors + sz_u128_vec_t *aes_vecs = (sz_u128_vec_t *)&state->aes.u64s[0]; + sz_u128_vec_t *sum_vecs = (sz_u128_vec_t *)&state->sum.u64s[0]; + sz_u128_vec_t *ins_vecs = (sz_u128_vec_t *)&state->ins.u64s[0]; + + // First 128-bit block + aes_vecs[0] = _sz_emulate_aesenc_si128_serial(aes_vecs[0], ins_vecs[0]); + sum_vecs[0] = _sz_emulate_shuffle_epi8_serial(sum_vecs[0], shuffle); + sum_vecs[0].u64s[0] += ins_vecs[0].u64s[0], sum_vecs[0].u64s[1] += ins_vecs[0].u64s[1]; + + // Second 128-bit block + aes_vecs[1] = _sz_emulate_aesenc_si128_serial(aes_vecs[1], ins_vecs[1]); + sum_vecs[1] = _sz_emulate_shuffle_epi8_serial(sum_vecs[1], shuffle); + sum_vecs[1].u64s[0] += ins_vecs[1].u64s[0], sum_vecs[1].u64s[1] += ins_vecs[1].u64s[1]; + + // Third 128-bit block + aes_vecs[2] = _sz_emulate_aesenc_si128_serial(aes_vecs[2], ins_vecs[2]); + sum_vecs[2] = _sz_emulate_shuffle_epi8_serial(sum_vecs[2], shuffle); + sum_vecs[2].u64s[0] += ins_vecs[2].u64s[0], sum_vecs[2].u64s[1] += ins_vecs[2].u64s[1]; + + // Fourth 128-bit block + aes_vecs[3] = _sz_emulate_aesenc_si128_serial(aes_vecs[3], ins_vecs[3]); + sum_vecs[3] = _sz_emulate_shuffle_epi8_serial(sum_vecs[3], shuffle); + sum_vecs[3].u64s[0] += ins_vecs[3].u64s[0], sum_vecs[3].u64s[1] += ins_vecs[3].u64s[1]; +} + +SZ_INTERNAL sz_u64_t _sz_hash_state_finalize_serial(sz_hash_state_t const *state) { + + // Mix the length into the key + sz_u128_vec_t key_with_length = state->key; + key_with_length.u64s[0] += state->ins_length; + + // To reuse the snippets above, let's cast to our familiar 128-bit vectors + sz_u128_vec_t *aes_vecs = (sz_u128_vec_t *)&state->aes.u64s[0]; + sz_u128_vec_t *sum_vecs = (sz_u128_vec_t *)&state->sum.u64s[0]; + + // Combine the "sum" and the "AES" blocks + sz_u128_vec_t mixed_registers0 = _sz_emulate_aesenc_si128_serial(sum_vecs[0], aes_vecs[0]); + sz_u128_vec_t mixed_registers1 = _sz_emulate_aesenc_si128_serial(sum_vecs[1], aes_vecs[1]); + sz_u128_vec_t mixed_registers2 = _sz_emulate_aesenc_si128_serial(sum_vecs[2], 
aes_vecs[2]); + sz_u128_vec_t mixed_registers3 = _sz_emulate_aesenc_si128_serial(sum_vecs[3], aes_vecs[3]); + + // Combine the mixed registers + sz_u128_vec_t mixed_registers01 = _sz_emulate_aesenc_si128_serial(mixed_registers0, mixed_registers1); + sz_u128_vec_t mixed_registers23 = _sz_emulate_aesenc_si128_serial(mixed_registers2, mixed_registers3); + sz_u128_vec_t mixed_registers = _sz_emulate_aesenc_si128_serial(mixed_registers01, mixed_registers23); + + // Make sure the "key" mixes enough with the state, + // as with less than 2 rounds - SMHasher fails + sz_u128_vec_t mixed_within_register = _sz_emulate_aesenc_si128_serial( + _sz_emulate_aesenc_si128_serial(mixed_registers, key_with_length), mixed_registers); + + // Extract the low 64 bits + return mixed_within_register.u64s[0]; +} + SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length, sz_u64_t seed) { - sz_unused(start && length && seed); - return 0; + if (length <= 16) { + // Initialize the AES block with a given seed + _sz_hash_minimal_t state; + _sz_hash_minimal_init_serial(&state, seed); + // Load the data and update the state + sz_u128_vec_t data_vec; + data_vec.u64s[0] = data_vec.u64s[1] = 0; + for (sz_size_t i = 0; i < length; ++i) data_vec.u8s[i] = start[i]; + _sz_hash_minimal_update_serial(&state, data_vec); + return _sz_hash_minimal_finalize_serial(&state, length); + } + else if (length <= 32) { + // Initialize the AES block with a given seed + _sz_hash_minimal_t state; + _sz_hash_minimal_init_serial(&state, seed); + // Load the data and update the state + sz_u128_vec_t data0_vec, data1_vec; + data0_vec.u64s[0] = *(sz_u64_t const *)(start); + data0_vec.u64s[1] = *(sz_u64_t const *)(start + 8); + data1_vec.u64s[0] = *(sz_u64_t const *)(start + length - 16); + data1_vec.u64s[1] = *(sz_u64_t const *)(start + length - 8); + // Let's shift the data within the register to de-interleave the bytes. + _sz_hash_shift_in_register_serial(&data1_vec, 32 - length); + _sz_hash_minimal_update_serial(&state, data0_vec); + _sz_hash_minimal_update_serial(&state, data1_vec); + return _sz_hash_minimal_finalize_serial(&state, length); + } + else if (length <= 48) { + // Initialize the AES block with a given seed + _sz_hash_minimal_t state; + _sz_hash_minimal_init_serial(&state, seed); + // Load the data and update the state + sz_u128_vec_t data0_vec, data1_vec, data2_vec; + data0_vec.u64s[0] = *(sz_u64_t const *)(start); + data0_vec.u64s[1] = *(sz_u64_t const *)(start + 8); + data1_vec.u64s[0] = *(sz_u64_t const *)(start + 16); + data1_vec.u64s[1] = *(sz_u64_t const *)(start + 24); + data2_vec.u64s[0] = *(sz_u64_t const *)(start + length - 16); + data2_vec.u64s[1] = *(sz_u64_t const *)(start + length - 8); + // Let's shift the data within the register to de-interleave the bytes. 
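+        // The trailing 16-byte block starts at `length - 16`, so its low `48 - length` bytes
+        // repeat data already folded in from the previous block; the shift below discards exactly
+        // those bytes, leaving only the not-yet-hashed tail, zero-padded at the top.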
+ _sz_hash_shift_in_register_serial(&data2_vec, 48 - length); + _sz_hash_minimal_update_serial(&state, data0_vec); + _sz_hash_minimal_update_serial(&state, data1_vec); + _sz_hash_minimal_update_serial(&state, data2_vec); + return _sz_hash_minimal_finalize_serial(&state, length); + } + else if (length <= 64) { + // Initialize the AES block with a given seed + _sz_hash_minimal_t state; + _sz_hash_minimal_init_serial(&state, seed); + // Load the data and update the state + sz_u128_vec_t data0_vec, data1_vec, data2_vec, data3_vec; + data0_vec.u64s[0] = *(sz_u64_t const *)(start); + data0_vec.u64s[1] = *(sz_u64_t const *)(start + 8); + data1_vec.u64s[0] = *(sz_u64_t const *)(start + 16); + data1_vec.u64s[1] = *(sz_u64_t const *)(start + 24); + data2_vec.u64s[0] = *(sz_u64_t const *)(start + 32); + data2_vec.u64s[1] = *(sz_u64_t const *)(start + 40); + data3_vec.u64s[0] = *(sz_u64_t const *)(start + length - 16); + data3_vec.u64s[1] = *(sz_u64_t const *)(start + length - 8); + // Let's shift the data within the register to de-interleave the bytes. + _sz_hash_shift_in_register_serial(&data3_vec, 64 - length); + _sz_hash_minimal_update_serial(&state, data0_vec); + _sz_hash_minimal_update_serial(&state, data1_vec); + _sz_hash_minimal_update_serial(&state, data2_vec); + _sz_hash_minimal_update_serial(&state, data3_vec); + return _sz_hash_minimal_finalize_serial(&state, length); + } + else { + // Use a larger state to handle the main loop and add different offsets + // to different lanes of the register + sz_hash_state_t state; + sz_hash_state_init_serial(&state, seed); + + for (; state.ins_length + 64 <= length; state.ins_length += 64) { + state.ins.u64s[0] = *(sz_u64_t const *)(start + state.ins_length); + state.ins.u64s[1] = *(sz_u64_t const *)(start + state.ins_length + 8); + state.ins.u64s[2] = *(sz_u64_t const *)(start + state.ins_length + 16); + state.ins.u64s[3] = *(sz_u64_t const *)(start + state.ins_length + 24); + state.ins.u64s[4] = *(sz_u64_t const *)(start + state.ins_length + 32); + state.ins.u64s[5] = *(sz_u64_t const *)(start + state.ins_length + 40); + state.ins.u64s[6] = *(sz_u64_t const *)(start + state.ins_length + 48); + state.ins.u64s[7] = *(sz_u64_t const *)(start + state.ins_length + 56); + _sz_hash_state_update_serial(&state); + } + if (state.ins_length < length) { + for (sz_size_t i = 0; i != 8; ++i) state.ins.u64s[i] = 0; + for (sz_size_t i = 0; state.ins_length < length; ++i, ++state.ins_length) + state.ins.u8s[i] = start[state.ins_length]; + _sz_hash_state_update_serial(&state); + state.ins_length = length; + } + return _sz_hash_state_finalize_serial(&state); + } } -SZ_PUBLIC void sz_generate_serial(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { - sz_unused(text && length && nonce); +SZ_PUBLIC void sz_hash_state_stream_serial(sz_hash_state_t *state_ptr, sz_cptr_t text, sz_size_t length) { + while (length) { + sz_size_t progress_in_block = state_ptr->ins_length % 64; + sz_size_t to_copy = sz_min_of_two(length, 64 - progress_in_block); + int const will_fill_block = progress_in_block + to_copy == 64; + // Update the metadata before we modify the `to_copy` variable + state_ptr->ins_length += to_copy; + length -= to_copy; + // Append to the internal buffer until it's full + while (to_copy--) state_ptr->ins.u8s[progress_in_block++] = *text++; + // If we've reached the end of the buffer, update the state + if (will_fill_block) { + _sz_hash_state_update_serial(state_ptr); + // Reset to zeros now, so we don't have to overwrite an immutable buffer in the folding state + for 
(int i = 0; i < 8; ++i) state_ptr->ins.u64s[i] = 0; + } + } } -SZ_PUBLIC void sz_hash_state_init_serial(sz_hash_state_t *state, sz_u64_t seed) { sz_unused(state && seed); } +SZ_PUBLIC sz_u64_t sz_hash_state_fold_serial(sz_hash_state_t const *state_ptr) { + sz_size_t length = state_ptr->ins_length; + if (length >= 64) return _sz_hash_state_finalize_serial(state_ptr); + + // Switch back to a smaller "minimal" state for small inputs + _sz_hash_minimal_t state; + state.key = state_ptr->key; + state.aes = *(sz_u128_vec_t const *)&state_ptr->aes.u64s[0]; + state.sum = *(sz_u128_vec_t const *)&state_ptr->sum.u64s[0]; -SZ_PUBLIC void sz_hash_state_stream_serial(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { - sz_unused(state && text && length); + // The logic is different depending on the length of the input + sz_u128_vec_t const *ins_vecs = (sz_u128_vec_t const *)&state_ptr->ins.u64s[0]; + if (length <= 16) { + _sz_hash_minimal_update_serial(&state, ins_vecs[0]); + return _sz_hash_minimal_finalize_serial(&state, length); + } + else if (length <= 32) { + _sz_hash_minimal_update_serial(&state, ins_vecs[0]); + _sz_hash_minimal_update_serial(&state, ins_vecs[1]); + return _sz_hash_minimal_finalize_serial(&state, length); + } + else if (length <= 48) { + _sz_hash_minimal_update_serial(&state, ins_vecs[0]); + _sz_hash_minimal_update_serial(&state, ins_vecs[1]); + _sz_hash_minimal_update_serial(&state, ins_vecs[2]); + return _sz_hash_minimal_finalize_serial(&state, length); + } + else { + _sz_hash_minimal_update_serial(&state, ins_vecs[0]); + _sz_hash_minimal_update_serial(&state, ins_vecs[1]); + _sz_hash_minimal_update_serial(&state, ins_vecs[2]); + _sz_hash_minimal_update_serial(&state, ins_vecs[3]); + return _sz_hash_minimal_finalize_serial(&state, length); + } } -SZ_PUBLIC sz_u64_t sz_hash_state_fold_serial(sz_hash_state_t const *state) { - sz_unused(state); - return 0; +SZ_PUBLIC void sz_generate_serial(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { + sz_unused(text && length && nonce); } #pragma endregion // Serial Implementation @@ -321,7 +715,7 @@ SZ_PUBLIC sz_u64_t sz_hash_state_fold_serial(sz_hash_state_t const *state) { #if SZ_USE_HASWELL #pragma GCC push_options #pragma GCC target("avx2") -#pragma clang attribute push(__attribute__((target("avx3332"))), apply_to = function) +#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) SZ_PUBLIC sz_u64_t sz_bytesum_haswell(sz_cptr_t text, sz_size_t length) { // The naive implementation of this function is very simple. 
@@ -408,82 +802,87 @@ SZ_PUBLIC sz_u64_t sz_bytesum_haswell(sz_cptr_t text, sz_size_t length) { } SZ_INTERNAL void _sz_hash_minimal_init_haswell(_sz_hash_minimal_t *state, sz_u64_t seed) { + sz_u64_t const *pi = _sz_hash_pi_constants(); + __m128i const pi0 = _mm_load_si128((__m128i const *)(pi)); + __m128i const pi1 = _mm_load_si128((__m128i const *)(pi + 8)); + + // The key is made from the seed and half of it will be mixed with the length in the end __m128i seed_vec = _mm_set1_epi64x(seed); - __m128i pi0 = _mm_set_epi64x(0x13198a2e03707344ull, 0x243f6a8885a308d3ull); - __m128i pi1 = _mm_set_epi64x(0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull); + state->key.xmm = seed_vec; + // XOR the user-supplied keys with the two "pi" constants __m128i k1 = _mm_xor_si128(seed_vec, pi0); __m128i k2 = _mm_xor_si128(seed_vec, pi1); - // Export the keys to the state + + // The first 128 bits of the "sum" and "AES" blocks are the same state->aes.xmm = k1; state->sum.xmm = k2; - state->key.xmm = _mm_xor_si128(pi0, pi1); } -SZ_INTERNAL sz_u64_t _sz_hash_minimal_finalize_haswell(_sz_hash_minimal_t const *state) { - // Combine the sum and the AES block +SZ_INTERNAL sz_u64_t _sz_hash_minimal_finalize_haswell(_sz_hash_minimal_t const *state, sz_size_t length) { + // Mix the length into the key + __m128i key_with_length = _mm_add_epi64(state->key.xmm, _mm_set_epi64x(0, length)); + // Combine the "sum" and the "AES" blocks __m128i mixed_registers = _mm_aesenc_si128(state->sum.xmm, state->aes.xmm); // Make sure the "key" mixes enough with the state, // as with less than 2 rounds - SMHasher fails __m128i mixed_within_register = - _mm_aesdec_si128(_mm_aesdec_si128(mixed_registers, state->key.xmm), mixed_registers); + _mm_aesenc_si128(_mm_aesenc_si128(mixed_registers, key_with_length), mixed_registers); // Extract the low 64 bits return _mm_cvtsi128_si64(mixed_within_register); } SZ_INTERNAL void _sz_hash_minimal_update_haswell(_sz_hash_minimal_t *state, __m128i block) { - // This shuffle mask is identical to "aHash": - __m128i const shuffle_mask = _mm_set_epi8( // - 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // - 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02); - state->aes.xmm = _mm_aesdec_si128(state->aes.xmm, block); + __m128i const shuffle_mask = _mm_load_si128((__m128i const *)_sz_hash_u8x16x4_shuffle()); + state->aes.xmm = _mm_aesenc_si128(state->aes.xmm, block); state->sum.xmm = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmm, shuffle_mask), block); } SZ_PUBLIC void sz_hash_state_init_haswell(sz_hash_state_t *state, sz_u64_t seed) { + // The key is made from the seed and half of it will be mixed with the length in the end __m128i seed_vec = _mm_set1_epi64x(seed); - __m128i pi0 = _mm_set_epi64x(0x13198a2e03707344ull, 0x243f6a8885a308d3ull); - __m128i pi1 = _mm_set_epi64x(0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull); + state->key.xmm = seed_vec; + // XOR the user-supplied keys with the two "pi" constants - __m128i k1 = _mm_xor_si128(seed_vec, pi0); - __m128i k2 = _mm_xor_si128(seed_vec, pi1); - // Export the keys to the state - state->aes.xmms[0] = state->aes.xmms[1] = state->aes.xmms[2] = state->aes.xmms[3] = k1; - state->sum.xmms[0] = state->sum.xmms[1] = state->sum.xmms[2] = state->sum.xmms[3] = k2; - state->key.xmms[0] = state->key.xmms[1] = state->key.xmms[2] = state->key.xmms[3] = _mm_xor_si128(pi0, pi1); + sz_u64_t const *pi = _sz_hash_pi_constants(); + for (int i = 0; i < 4; ++i) + state->aes.xmms[i] = _mm_xor_si128(seed_vec, _mm_load_si128((__m128i const *)(pi + i * 2))); + for (int i = 0; i < 4; 
++i) + state->sum.xmms[i] = _mm_xor_si128(seed_vec, _mm_load_si128((__m128i const *)(pi + i * 2 + 8))); + + // The inputs are zeroed out at the beginning + state->ins.xmms[0] = state->ins.xmms[1] = state->ins.xmms[2] = state->ins.xmms[3] = _mm_setzero_si128(); state->ins_length = 0; } -SZ_INTERNAL void _sz_hash_state_update_haswell(sz_hash_state_t *state, __m128i block0, __m128i block1, __m128i block2, - __m128i block3) { - // This shuffle mask is identical to "aHash": - __m128i const shuffle_mask = _mm_set_epi8( // - 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // - 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02); - state->aes.xmms[0] = _mm_aesdec_si128(state->aes.xmms[0], block0); - state->sum.xmms[0] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[0], shuffle_mask), block0); - state->aes.xmms[1] = _mm_aesdec_si128(state->aes.xmms[1], block1); - state->sum.xmms[1] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[1], shuffle_mask), block1); - state->aes.xmms[2] = _mm_aesdec_si128(state->aes.xmms[2], block2); - state->sum.xmms[2] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[2], shuffle_mask), block2); - state->aes.xmms[3] = _mm_aesdec_si128(state->aes.xmms[3], block3); - state->sum.xmms[3] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[3], shuffle_mask), block3); +SZ_INTERNAL void _sz_hash_state_update_haswell(sz_hash_state_t *state) { + __m128i const shuffle_mask = _mm_load_si128((__m128i const *)_sz_hash_u8x16x4_shuffle()); + state->aes.xmms[0] = _mm_aesenc_si128(state->aes.xmms[0], state->ins.xmms[0]); + state->sum.xmms[0] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[0], shuffle_mask), state->ins.xmms[0]); + state->aes.xmms[1] = _mm_aesenc_si128(state->aes.xmms[1], state->ins.xmms[1]); + state->sum.xmms[1] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[1], shuffle_mask), state->ins.xmms[1]); + state->aes.xmms[2] = _mm_aesenc_si128(state->aes.xmms[2], state->ins.xmms[2]); + state->sum.xmms[2] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[2], shuffle_mask), state->ins.xmms[2]); + state->aes.xmms[3] = _mm_aesenc_si128(state->aes.xmms[3], state->ins.xmms[3]); + state->sum.xmms[3] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[3], shuffle_mask), state->ins.xmms[3]); } -SZ_INTERNAL sz_u64_t _sz_hash_state_finalize_haswell(sz_hash_state_t const *state) { - // Combine the sum and the AES block - __m128i mixed_registers0 = _mm_aesenc_si128(state->sum.xmms[0], state->aes.xmms[0]); - __m128i mixed_registers1 = _mm_aesenc_si128(state->sum.xmms[1], state->aes.xmms[1]); - __m128i mixed_registers2 = _mm_aesenc_si128(state->sum.xmms[2], state->aes.xmms[2]); - __m128i mixed_registers3 = _mm_aesenc_si128(state->sum.xmms[3], state->aes.xmms[3]); +SZ_INTERNAL sz_u64_t _sz_hash_state_finalize_haswell(sz_hash_state_t const *state_ptr) { + // Mix the length into the key + __m128i key_with_length = _mm_add_epi64(state_ptr->key.xmm, _mm_set_epi64x(0, state_ptr->ins_length)); + // Combine the "sum" and the "AES" blocks + __m128i mixed_registers0 = _mm_aesenc_si128(state_ptr->sum.xmms[0], state_ptr->aes.xmms[0]); + __m128i mixed_registers1 = _mm_aesenc_si128(state_ptr->sum.xmms[1], state_ptr->aes.xmms[1]); + __m128i mixed_registers2 = _mm_aesenc_si128(state_ptr->sum.xmms[2], state_ptr->aes.xmms[2]); + __m128i mixed_registers3 = _mm_aesenc_si128(state_ptr->sum.xmms[3], state_ptr->aes.xmms[3]); // Combine the mixed registers __m128i mixed_registers01 = _mm_aesenc_si128(mixed_registers0, mixed_registers1); __m128i mixed_registers23 = _mm_aesenc_si128(mixed_registers2, mixed_registers3); __m128i 
mixed_registers = _mm_aesenc_si128(mixed_registers01, mixed_registers23); // Make sure the "key" mixes enough with the state, // as with less than 2 rounds - SMHasher fails - __m128i mixed_within_register = _mm_aesdec_si128( // - _mm_aesdec_si128(mixed_registers, state->key.xmms[0]), mixed_registers); + __m128i mixed_within_register = + _mm_aesenc_si128(_mm_aesenc_si128(mixed_registers, key_with_length), mixed_registers); // Extract the low 64 bits return _mm_cvtsi128_si64(mixed_within_register); } @@ -491,86 +890,77 @@ SZ_INTERNAL sz_u64_t _sz_hash_state_finalize_haswell(sz_hash_state_t const *stat SZ_PUBLIC sz_u64_t sz_hash_haswell(sz_cptr_t start, sz_size_t length, sz_u64_t seed) { if (length <= 16) { - // Initialize the AES block with a given seed and update with the input length + // Initialize the AES block with a given seed _sz_hash_minimal_t state; _sz_hash_minimal_init_haswell(&state, seed); - state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); // Load the data and update the state sz_u128_vec_t data_vec; data_vec.xmm = _mm_setzero_si128(); for (sz_size_t i = 0; i < length; ++i) data_vec.u8s[i] = start[i]; _sz_hash_minimal_update_haswell(&state, data_vec.xmm); - return _sz_hash_minimal_finalize_haswell(&state); + return _sz_hash_minimal_finalize_haswell(&state, length); } else if (length <= 32) { - // Initialize the AES block with a given seed and update with the input length + // Initialize the AES block with a given seed _sz_hash_minimal_t state; _sz_hash_minimal_init_haswell(&state, seed); - state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec; - data0_vec.xmm = _mm_lddqu_si128(start); - data1_vec.xmm = _mm_lddqu_si128(start + length - 16); + data0_vec.xmm = _mm_lddqu_si128((__m128i const *)(start)); + data1_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + length - 16)); // Let's shift the data within the register to de-interleave the bytes. - data1_vec.xmm = _mm_bsrli_si128(data1_vec.xmm, 32 - length); + _sz_hash_shift_in_register_serial(&data1_vec, 32 - length); _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); - return _sz_hash_minimal_finalize_haswell(&state); + return _sz_hash_minimal_finalize_haswell(&state, length); } else if (length <= 48) { - // Initialize the AES block with a given seed and update with the input length + // Initialize the AES block with a given seed _sz_hash_minimal_t state; _sz_hash_minimal_init_haswell(&state, seed); - state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec, data2_vec; - data0_vec.xmm = _mm_lddqu_si128(start); - data1_vec.xmm = _mm_lddqu_si128(start + 16); - data2_vec.xmm = _mm_lddqu_si128(start + length - 16); + data0_vec.xmm = _mm_lddqu_si128((__m128i const *)(start)); + data1_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + 16)); + data2_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + length - 16)); // Let's shift the data within the register to de-interleave the bytes. 
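+        // `_mm_bsrli_si128` takes its byte count as a compile-time immediate, so it cannot shift
+        // by the runtime value `48 - length`; the scalar helper below performs the same right
+        // shift with two 64-bit shifts instead.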
- data2_vec.xmm = _mm_bsrli_si128(data2_vec.xmm, 48 - length); + _sz_hash_shift_in_register_serial(&data2_vec, 48 - length); _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); _sz_hash_minimal_update_haswell(&state, data2_vec.xmm); - return _sz_hash_minimal_finalize_haswell(&state); + return _sz_hash_minimal_finalize_haswell(&state, length); } else if (length <= 64) { - // Initialize the AES block with a given seed and update with the input length + // Initialize the AES block with a given seed _sz_hash_minimal_t state; _sz_hash_minimal_init_haswell(&state, seed); - state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec, data2_vec, data3_vec; - data0_vec.xmm = _mm_lddqu_si128(start); - data1_vec.xmm = _mm_lddqu_si128(start + 16); - data2_vec.xmm = _mm_lddqu_si128(start + 32); - data3_vec.xmm = _mm_lddqu_si128(start + length - 16); + data0_vec.xmm = _mm_lddqu_si128((__m128i const *)(start)); + data1_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + 16)); + data2_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + 32)); + data3_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + length - 16)); // Let's shift the data within the register to de-interleave the bytes. - data3_vec.xmm = _mm_bsrli_si128(data3_vec.xmm, 64 - length); + _sz_hash_shift_in_register_serial(&data3_vec, 64 - length); _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); _sz_hash_minimal_update_haswell(&state, data2_vec.xmm); _sz_hash_minimal_update_haswell(&state, data3_vec.xmm); - return _sz_hash_minimal_finalize_haswell(&state); + return _sz_hash_minimal_finalize_haswell(&state, length); } else { // Use a larger state to handle the main loop and add different offsets // to different lanes of the register sz_hash_state_t state; sz_hash_state_init_haswell(&state, seed); - state.aes.xmms[0] = _mm_add_epi64(state.aes.xmms[0], _mm_set_epi64x(0, length)); - state.aes.xmms[1] = _mm_add_epi64(state.aes.xmms[1], _mm_set_epi64x(16, length)); - state.aes.xmms[2] = _mm_add_epi64(state.aes.xmms[2], _mm_set_epi64x(32, length)); - state.aes.xmms[3] = _mm_add_epi64(state.aes.xmms[3], _mm_set_epi64x(48, length)); - for (; state.ins_length + 64 <= length; state.ins_length += 64) { - state.ins.xmms[0] = _mm_lddqu_si128(start + state.ins_length); - state.ins.xmms[1] = _mm_lddqu_si128(start + state.ins_length + 16); - state.ins.xmms[2] = _mm_lddqu_si128(start + state.ins_length + 32); - state.ins.xmms[3] = _mm_lddqu_si128(start + state.ins_length + 48); - _sz_hash_state_update_haswell(&state, state.ins.xmms[0], state.ins.xmms[1], state.ins.xmms[2], - state.ins.xmms[3]); + state.ins.xmms[0] = _mm_lddqu_si128((__m128i const *)(start + state.ins_length)); + state.ins.xmms[1] = _mm_lddqu_si128((__m128i const *)(start + state.ins_length + 16)); + state.ins.xmms[2] = _mm_lddqu_si128((__m128i const *)(start + state.ins_length + 32)); + state.ins.xmms[3] = _mm_lddqu_si128((__m128i const *)(start + state.ins_length + 48)); + _sz_hash_state_update_haswell(&state); } + // Handle the tail, resetting the registers to zero first if (state.ins_length < length) { state.ins.xmms[0] = _mm_setzero_si128(); state.ins.xmms[1] = _mm_setzero_si128(); @@ -578,26 +968,85 @@ SZ_PUBLIC sz_u64_t sz_hash_haswell(sz_cptr_t start, sz_size_t length, sz_u64_t s state.ins.xmms[3] = _mm_setzero_si128(); for (sz_size_t i = 0; state.ins_length < length; ++i, 
++state.ins_length) state.ins.u8s[i] = start[state.ins_length]; - _sz_hash_state_update_haswell(&state, state.ins.xmms[0], state.ins.xmms[1], state.ins.xmms[2], - state.ins.xmms[3]); + _sz_hash_state_update_haswell(&state); + state.ins_length = length; } return _sz_hash_state_finalize_haswell(&state); } } -SZ_PUBLIC void sz_generate_haswell(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { - sz_generate_serial(text, length, nonce); +SZ_PUBLIC void sz_hash_state_stream_haswell(sz_hash_state_t *state_ptr, sz_cptr_t text, sz_size_t length) { + while (length) { + // Append to the internal buffer until it's full + if (state_ptr->ins_length % 64 == 0 && length >= 64) { + state_ptr->ins.xmms[0] = _mm_lddqu_si128((__m128i const *)text); + state_ptr->ins.xmms[1] = _mm_lddqu_si128((__m128i const *)(text + 16)); + state_ptr->ins.xmms[2] = _mm_lddqu_si128((__m128i const *)(text + 32)); + state_ptr->ins.xmms[3] = _mm_lddqu_si128((__m128i const *)(text + 48)); + _sz_hash_state_update_haswell(state_ptr); + state_ptr->ins_length += 64; + text += 64; + length -= 64; + } + // If vectorization isn't that trivial - fall back to the serial implementation + else { + sz_size_t progress_in_block = state_ptr->ins_length % 64; + sz_size_t to_copy = sz_min_of_two(length, 64 - progress_in_block); + int const will_fill_block = progress_in_block + to_copy == 64; + // Update the metadata before we modify the `to_copy` variable + state_ptr->ins_length += to_copy; + length -= to_copy; + // Append to the internal buffer until it's full + while (to_copy--) state_ptr->ins.u8s[progress_in_block++] = *text++; + // If we've reached the end of the buffer, update the state + if (will_fill_block) { + _sz_hash_state_update_haswell(state_ptr); + // Reset to zeros now, so we don't have to overwrite an immutable buffer in the folding state + for (int i = 0; i < 4; ++i) state_ptr->ins.xmms[i] = _mm_setzero_si128(); + } + } + } } -SZ_PUBLIC void sz_hash_state_init_haswell(sz_hash_state_t *state, sz_u64_t seed) { - sz_hash_state_init_serial(state, seed); -} +SZ_PUBLIC sz_u64_t sz_hash_state_fold_haswell(sz_hash_state_t const *state_ptr) { + sz_size_t length = state_ptr->ins_length; + if (length >= 64) return _sz_hash_state_finalize_haswell(state_ptr); -SZ_PUBLIC void sz_hash_state_stream_haswell(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { - sz_hash_state_stream_serial(state, text, length); + // Switch back to a smaller "minimal" state for small inputs + _sz_hash_minimal_t state; + state.key.xmm = state_ptr->key.xmm; + state.aes.xmm = state_ptr->aes.xmms[0]; + state.sum.xmm = state_ptr->sum.xmms[0]; + + // The logic is different depending on the length of the input + __m128i const *ins_vecs = (__m128i const *)&state_ptr->ins.xmms[0]; + if (length <= 16) { + _sz_hash_minimal_update_haswell(&state, ins_vecs[0]); + return _sz_hash_minimal_finalize_haswell(&state, length); + } + else if (length <= 32) { + _sz_hash_minimal_update_haswell(&state, ins_vecs[0]); + _sz_hash_minimal_update_haswell(&state, ins_vecs[1]); + return _sz_hash_minimal_finalize_haswell(&state, length); + } + else if (length <= 48) { + _sz_hash_minimal_update_haswell(&state, ins_vecs[0]); + _sz_hash_minimal_update_haswell(&state, ins_vecs[1]); + _sz_hash_minimal_update_haswell(&state, ins_vecs[2]); + return _sz_hash_minimal_finalize_haswell(&state, length); + } + else { + _sz_hash_minimal_update_haswell(&state, ins_vecs[0]); + _sz_hash_minimal_update_haswell(&state, ins_vecs[1]); + _sz_hash_minimal_update_haswell(&state, ins_vecs[2]); + 
_sz_hash_minimal_update_haswell(&state, ins_vecs[3]); + return _sz_hash_minimal_finalize_haswell(&state, length); + } } -SZ_PUBLIC sz_u64_t sz_hash_state_fold_haswell(sz_hash_state_t const *state) { return sz_hash_state_fold_serial(state); } +SZ_PUBLIC void sz_generate_haswell(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { + sz_generate_serial(text, length, nonce); +} #pragma clang attribute pop #pragma GCC pop_options @@ -716,99 +1165,91 @@ SZ_PUBLIC sz_u64_t sz_bytesum_skylake(sz_cptr_t text, sz_size_t length) { } SZ_PUBLIC void sz_hash_state_init_skylake(sz_hash_state_t *state, sz_u64_t seed) { + // The key is made from the seed and half of it will be mixed with the length in the end __m512i seed_vec = _mm512_set1_epi64(seed); - __m512i pi0 = _mm512_set_epi64( // - 0x13198a2e03707344ull, 0x243f6a8885a308d3ull, 0x13198a2e03707344ull, 0x243f6a8885a308d3ull, - 0x13198a2e03707344ull, 0x243f6a8885a308d3ull, 0x13198a2e03707344ull, 0x243f6a8885a308d3ull); - __m512i pi1 = _mm512_set_epi64( // - 0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull, 0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull, - 0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull, 0x082efa98ec4e6c89ull, 0xa4093822299f31d0ull); + state->key.xmm = _mm512_castsi512_si128(seed_vec); + // XOR the user-supplied keys with the two "pi" constants - __m512i k1 = _mm512_xor_si512(seed_vec, pi0); - __m512i k2 = _mm512_xor_si512(seed_vec, pi1); - // Export the keys to the state - state->aes.zmm = k1; - state->sum.zmm = k2; - state->key.zmm = _mm512_xor_si512(pi0, pi1); + sz_u64_t const *pi = _sz_hash_pi_constants(); + __m512i const pi0 = _mm512_load_epi64((__m512i const *)(pi)); + __m512i const pi1 = _mm512_load_epi64((__m512i const *)(pi + 8)); + state->aes.zmm = _mm512_xor_si512(seed_vec, pi0); + state->sum.zmm = _mm512_xor_si512(seed_vec, pi1); + + // The inputs are zeroed out at the beginning + state->ins.zmm = _mm512_setzero_si512(); state->ins_length = 0; } SZ_PUBLIC sz_u64_t sz_hash_skylake(sz_cptr_t start, sz_size_t length, sz_u64_t seed) { if (length <= 16) { - // Initialize the AES block with a given seed and update with the input length + // Initialize the AES block with a given seed _sz_hash_minimal_t state; _sz_hash_minimal_init_haswell(&state, seed); - state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); // Load the data and update the state sz_u128_vec_t data_vec; data_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length), start); _sz_hash_minimal_update_haswell(&state, data_vec.xmm); - return _sz_hash_minimal_finalize_haswell(&state); + return _sz_hash_minimal_finalize_haswell(&state, length); } else if (length <= 32) { - // Initialize the AES block with a given seed and update with the input length + // Initialize the AES block with a given seed _sz_hash_minimal_t state; _sz_hash_minimal_init_haswell(&state, seed); - state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec; - data0_vec.xmm = _mm_lddqu_si128(start); + data0_vec.xmm = _mm_lddqu_si128((__m128i const *)(start)); data1_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length - 16), start + 16); _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); - return _sz_hash_minimal_finalize_haswell(&state); + return _sz_hash_minimal_finalize_haswell(&state, length); } else if (length <= 48) { - // Initialize the AES block with a given seed and update with the input length + // Initialize the AES block with a given 
seed _sz_hash_minimal_t state; _sz_hash_minimal_init_haswell(&state, seed); - state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec, data2_vec; - data0_vec.xmm = _mm_lddqu_si128(start); - data1_vec.xmm = _mm_lddqu_si128(start + 16); + data0_vec.xmm = _mm_lddqu_si128((__m128i const *)(start)); + data1_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + 16)); data2_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length - 32), start + 32); _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); _sz_hash_minimal_update_haswell(&state, data2_vec.xmm); - return _sz_hash_minimal_finalize_haswell(&state); + return _sz_hash_minimal_finalize_haswell(&state, length); } else if (length <= 64) { - // Initialize the AES block with a given seed and update with the input length + // Initialize the AES block with a given seed _sz_hash_minimal_t state; _sz_hash_minimal_init_haswell(&state, seed); - state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec, data2_vec, data3_vec; - data0_vec.xmm = _mm_lddqu_si128(start); - data1_vec.xmm = _mm_lddqu_si128(start + 16); - data2_vec.xmm = _mm_lddqu_si128(start + 32); + data0_vec.xmm = _mm_lddqu_si128((__m128i const *)(start)); + data1_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + 16)); + data2_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + 32)); data3_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length - 48), start + 48); _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); _sz_hash_minimal_update_haswell(&state, data2_vec.xmm); _sz_hash_minimal_update_haswell(&state, data3_vec.xmm); - return _sz_hash_minimal_finalize_haswell(&state); + return _sz_hash_minimal_finalize_haswell(&state, length); } else { // Use a larger state to handle the main loop and add different offsets // to different lanes of the register sz_hash_state_t state; sz_hash_state_init_skylake(&state, seed); - state.aes.zmm = _mm512_add_epi64( // - state.aes.zmm, // - _mm512_set_epi64(0, length, 16, length, 32, length, 48, length)); for (; state.ins_length + 64 <= length; state.ins_length += 64) { state.ins.zmm = _mm512_loadu_epi8(start + state.ins_length); - _sz_hash_state_update_haswell(&state, state.ins.xmms[0], state.ins.xmms[1], state.ins.xmms[2], - state.ins.xmms[3]); + _sz_hash_state_update_haswell(&state); } if (state.ins_length < length) { state.ins.zmm = _mm512_maskz_loadu_epi8( // _sz_u64_mask_until(length - state.ins_length), start + state.ins_length); - _sz_hash_state_update_skylake(&state, state.ins.zmm); + _sz_hash_state_update_haswell(&state); + state.ins_length = length; } return _sz_hash_state_finalize_haswell(&state); } @@ -970,97 +1411,81 @@ SZ_PUBLIC sz_u64_t sz_bytesum_ice(sz_cptr_t text, sz_size_t length) { } } -SZ_INTERNAL void _sz_hash_state_update_ice(sz_hash_state_t *state, __m512i block) { - // This shuffle mask is identical to "aHash": - __m512i const shuffle_mask = _mm512_set_epi8( // - 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // - 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02, // - 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // - 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02, // - 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // - 0x0e, 0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02, // - 0x04, 0x0b, 0x09, 0x06, 0x08, 0x0d, 0x0f, 0x05, // - 0x0e, 
0x03, 0x01, 0x0c, 0x00, 0x07, 0x0a, 0x02 // - ); - state->aes.zmm = _mm512_aesdec_epi128(state->aes.zmm, block); - state->sum.zmm = _mm512_add_epi64(_mm512_shuffle_epi8(state->sum.zmm, shuffle_mask), block); +SZ_INTERNAL void _sz_hash_state_update_ice(sz_hash_state_t *state) { + __m512i const shuffle_mask = _mm512_load_si512((__m512i const *)_sz_hash_u8x16x4_shuffle()); + state->aes.zmm = _mm512_aesenc_epi128(state->aes.zmm, state->ins.zmm); + state->sum.zmm = _mm512_add_epi64(_mm512_shuffle_epi8(state->sum.zmm, shuffle_mask), state->ins.zmm); } SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed) { if (length <= 16) { - // Initialize the AES block with a given seed and update with the input length + // Initialize the AES block with a given seed _sz_hash_minimal_t state; _sz_hash_minimal_init_haswell(&state, seed); - state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); // Load the data and update the state sz_u128_vec_t data_vec; data_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length), start); _sz_hash_minimal_update_haswell(&state, data_vec.xmm); - return _sz_hash_minimal_finalize_haswell(&state); + return _sz_hash_minimal_finalize_haswell(&state, length); } else if (length <= 32) { - // Initialize the AES block with a given seed and update with the input length + // Initialize the AES block with a given seed _sz_hash_minimal_t state; _sz_hash_minimal_init_haswell(&state, seed); - state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec; - data0_vec.xmm = _mm_lddqu_si128(start); + data0_vec.xmm = _mm_lddqu_si128((__m128i const *)(start)); data1_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length - 16), start + 16); _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); - return _sz_hash_minimal_finalize_haswell(&state); + return _sz_hash_minimal_finalize_haswell(&state, length); } else if (length <= 48) { - // Initialize the AES block with a given seed and update with the input length + // Initialize the AES block with a given seed _sz_hash_minimal_t state; _sz_hash_minimal_init_haswell(&state, seed); - state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec, data2_vec; - data0_vec.xmm = _mm_lddqu_si128(start); - data1_vec.xmm = _mm_lddqu_si128(start + 16); + data0_vec.xmm = _mm_lddqu_si128((__m128i const *)(start)); + data1_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + 16)); data2_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length - 32), start + 32); _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); _sz_hash_minimal_update_haswell(&state, data2_vec.xmm); - return _sz_hash_minimal_finalize_haswell(&state); + return _sz_hash_minimal_finalize_haswell(&state, length); } else if (length <= 64) { - // Initialize the AES block with a given seed and update with the input length + // Initialize the AES block with a given seed _sz_hash_minimal_t state; _sz_hash_minimal_init_haswell(&state, seed); - state.aes.xmm = _mm_add_epi64(state.aes.xmm, _mm_set_epi64x(0, length)); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec, data2_vec, data3_vec; - data0_vec.xmm = _mm_lddqu_si128(start); - data1_vec.xmm = _mm_lddqu_si128(start + 16); - data2_vec.xmm = _mm_lddqu_si128(start + 32); + data0_vec.xmm = 
_mm_lddqu_si128((__m128i const *)(start)); + data1_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + 16)); + data2_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + 32)); data3_vec.xmm = _mm_maskz_loadu_epi8(_sz_u16_mask_until(length - 48), start + 48); _sz_hash_minimal_update_haswell(&state, data0_vec.xmm); _sz_hash_minimal_update_haswell(&state, data1_vec.xmm); _sz_hash_minimal_update_haswell(&state, data2_vec.xmm); _sz_hash_minimal_update_haswell(&state, data3_vec.xmm); - return _sz_hash_minimal_finalize_haswell(&state); + return _sz_hash_minimal_finalize_haswell(&state, length); } else { // Use a larger state to handle the main loop and add different offsets // to different lanes of the register sz_hash_state_t state; sz_hash_state_init_skylake(&state, seed); - state.aes.zmm = _mm512_add_epi64( // - state.aes.zmm, // - _mm512_set_epi64(0, length, 16, length, 32, length, 48, length)); for (; state.ins_length + 64 <= length; state.ins_length += 64) { state.ins.zmm = _mm512_loadu_epi8(start + state.ins_length); - _sz_hash_state_update_ice(&state, state.ins.zmm); + _sz_hash_state_update_ice(&state); } if (state.ins_length < length) { state.ins.zmm = _mm512_maskz_loadu_epi8( // _sz_u64_mask_until(length - state.ins_length), start + state.ins_length); - _sz_hash_state_update_ice(&state, state.ins.zmm); + _sz_hash_state_update_ice(&state); + state.ins_length = length; } return _sz_hash_state_finalize_haswell(&state); } @@ -1119,6 +1544,10 @@ SZ_PUBLIC void sz_generate_ice(sz_ptr_t output, sz_size_t length, sz_u64_t nonce } } +SZ_PUBLIC void sz_hash_state_init_ice(sz_hash_state_t *state, sz_u64_t seed) { + sz_hash_state_init_skylake(state, seed); +} + SZ_PUBLIC void sz_hash_state_stream_ice(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { sz_hash_state_stream_serial(state, text, length); } From 8ac3a23a8db20a44f883475f6b865df45d883308 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 26 Feb 2025 10:46:07 +0000 Subject: [PATCH 128/751] Add: Streaming hash benchmarks --- scripts/bench_token.cpp | 51 +++++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/scripts/bench_token.cpp b/scripts/bench_token.cpp index 93ae2b7e..378ad4f0 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -38,24 +38,60 @@ tracked_unary_functions_t bytesum_functions() { return result; } -tracked_unary_functions_t hashing_functions() { +tracked_unary_functions_t hash_functions() { auto wrap_sz = [](auto function) -> unary_function_t { return unary_function_t([function](std::string_view s) { return function(s.data(), s.size(), 42); }); }; tracked_unary_functions_t result = { - {"std::hash", [](std::string_view s) { return std::hash {}(s); }}, {"sz_hash_serial", wrap_sz(sz_hash_serial)}, #if SZ_USE_HASWELL - {"sz_hash_haswell", wrap_sz(sz_hash_haswell)}, + {"sz_hash_haswell", wrap_sz(sz_hash_haswell), true}, +#endif +#if SZ_USE_SKYLAKE + {"sz_hash_skylake", wrap_sz(sz_hash_skylake), true}, +#endif +#if SZ_USE_ICE + {"sz_hash_ice", wrap_sz(sz_hash_ice), true}, +#endif +#if SZ_USE_NEON + {"sz_hash_neon", wrap_sz(sz_hash_neon), true}, +#endif + {"std::hash", [](std::string_view s) { return std::hash {}(s); }}, + }; + return result; +} + +struct wrapped_incremental_hash { + sz_hash_state_t state; + sz_hash_state_stream_t stream; + sz_hash_state_fold_t fold; + + wrapped_incremental_hash(sz_hash_state_stream_t s, sz_hash_state_fold_t f) : stream(s), fold(f) { + sz_hash_state_init(&state, 42); + } + 
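+    // Note: the state persists across calls, so each invocation streams another string into the same running hash before folding it.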
+ std::size_t operator()(std::string_view s) noexcept { + stream(&state, s.data(), s.size()); + return fold(&state); + } +}; + +tracked_unary_functions_t hash_stream_functions() { + tracked_unary_functions_t result = { + {"sz_hash_stream_serial", wrapped_incremental_hash(sz_hash_state_stream_serial, sz_hash_state_fold_serial)}, +#if SZ_USE_HASWELL + {"sz_hash_stream_haswell", wrapped_incremental_hash(sz_hash_state_stream_haswell, sz_hash_state_fold_haswell), + true}, #endif #if SZ_USE_SKYLAKE - {"sz_hash_skylake", wrap_sz(sz_hash_skylake)}, + {"sz_hash_stream_skylake", wrapped_incremental_hash(sz_hash_state_stream_skylake, sz_hash_state_fold_skylake), + true}, #endif #if SZ_USE_ICE - {"sz_hash_ice", wrap_sz(sz_hash_ice)}, + {"sz_hash_stream_ice", wrapped_incremental_hash(sz_hash_state_stream_ice, sz_hash_state_fold_ice), true}, #endif #if SZ_USE_NEON - {"sz_hash_neon", wrap_sz(sz_hash_neon)}, + {"sz_hash_stream_neon", wrapped_incremental_hash(sz_hash_state_stream_neon, sz_hash_state_fold_neon), true}, #endif }; return result; @@ -152,7 +188,8 @@ void bench(strings_type &&strings) { // Benchmark logical operations bench_unary_functions(strings, bytesum_functions()); - bench_unary_functions(strings, hashing_functions()); + bench_unary_functions(strings, hash_functions()); + bench_unary_functions(strings, hash_stream_functions()); bench_binary_functions(strings, equality_functions()); bench_binary_functions(strings, ordering_functions()); From 2607d4513f97b2dd55ac6301e7dc956ad4a5d9ad Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 26 Feb 2025 11:54:45 +0000 Subject: [PATCH 129/751] Add: Streaming hashing on Ice Lake & Skylake X --- include/stringzilla/hash.h | 228 ++++++++++++++++++++++--------------- 1 file changed, 134 insertions(+), 94 deletions(-) diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index 105ffabe..50deb776 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -297,13 +297,15 @@ SZ_PUBLIC sz_u64_t sz_hash_state_fold_neon(sz_hash_state_t const *state); * @note The current content of the `ins` buffer and its length is ignored. */ SZ_PUBLIC sz_bool_t sz_hash_state_equal(sz_hash_state_t const *lhs, sz_hash_state_t const *rhs) { - return lhs->aes.u64s[0] == rhs->aes.u64s[0] && lhs->aes.u64s[1] == rhs->aes.u64s[1] && - lhs->aes.u64s[2] == rhs->aes.u64s[2] && lhs->aes.u64s[3] == rhs->aes.u64s[3] && - lhs->sum.u64s[0] == rhs->sum.u64s[0] && lhs->sum.u64s[1] == rhs->sum.u64s[1] && - lhs->sum.u64s[2] == rhs->sum.u64s[2] && lhs->sum.u64s[3] == rhs->sum.u64s[3] && - lhs->key.u64s[0] == rhs->key.u64s[0] && lhs->key.u64s[1] == rhs->key.u64s[1] - ? sz_true_k - : sz_false_k; + int same_aes = // + lhs->aes.u64s[0] == rhs->aes.u64s[0] && lhs->aes.u64s[1] == rhs->aes.u64s[1] && + lhs->aes.u64s[2] == rhs->aes.u64s[2] && lhs->aes.u64s[3] == rhs->aes.u64s[3]; + int same_sum = // + lhs->sum.u64s[0] == rhs->sum.u64s[0] && lhs->sum.u64s[1] == rhs->sum.u64s[1] && + lhs->sum.u64s[2] == rhs->sum.u64s[2] && lhs->sum.u64s[3] == rhs->sum.u64s[3]; + int same_key = // + lhs->key.u64s[0] == rhs->key.u64s[0] && lhs->key.u64s[1] == rhs->key.u64s[1]; + return same_aes && same_sum && same_key ? 
sz_true_k : sz_false_k; } #pragma endregion // Helper Methods @@ -647,58 +649,58 @@ SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length, sz_u64_t se } } -SZ_PUBLIC void sz_hash_state_stream_serial(sz_hash_state_t *state_ptr, sz_cptr_t text, sz_size_t length) { +SZ_PUBLIC void sz_hash_state_stream_serial(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { while (length) { - sz_size_t progress_in_block = state_ptr->ins_length % 64; + sz_size_t progress_in_block = state->ins_length % 64; sz_size_t to_copy = sz_min_of_two(length, 64 - progress_in_block); int const will_fill_block = progress_in_block + to_copy == 64; // Update the metadata before we modify the `to_copy` variable - state_ptr->ins_length += to_copy; + state->ins_length += to_copy; length -= to_copy; // Append to the internal buffer until it's full - while (to_copy--) state_ptr->ins.u8s[progress_in_block++] = *text++; + while (to_copy--) state->ins.u8s[progress_in_block++] = *text++; // If we've reached the end of the buffer, update the state if (will_fill_block) { - _sz_hash_state_update_serial(state_ptr); + _sz_hash_state_update_serial(state); // Reset to zeros now, so we don't have to overwrite an immutable buffer in the folding state - for (int i = 0; i < 8; ++i) state_ptr->ins.u64s[i] = 0; + for (int i = 0; i < 8; ++i) state->ins.u64s[i] = 0; } } } -SZ_PUBLIC sz_u64_t sz_hash_state_fold_serial(sz_hash_state_t const *state_ptr) { - sz_size_t length = state_ptr->ins_length; - if (length >= 64) return _sz_hash_state_finalize_serial(state_ptr); +SZ_PUBLIC sz_u64_t sz_hash_state_fold_serial(sz_hash_state_t const *state) { + sz_size_t length = state->ins_length; + if (length >= 64) return _sz_hash_state_finalize_serial(state); // Switch back to a smaller "minimal" state for small inputs - _sz_hash_minimal_t state; - state.key = state_ptr->key; - state.aes = *(sz_u128_vec_t const *)&state_ptr->aes.u64s[0]; - state.sum = *(sz_u128_vec_t const *)&state_ptr->sum.u64s[0]; + _sz_hash_minimal_t minimal_state; + minimal_state.key = state->key; + minimal_state.aes = *(sz_u128_vec_t const *)&state->aes.u64s[0]; + minimal_state.sum = *(sz_u128_vec_t const *)&state->sum.u64s[0]; // The logic is different depending on the length of the input - sz_u128_vec_t const *ins_vecs = (sz_u128_vec_t const *)&state_ptr->ins.u64s[0]; + sz_u128_vec_t const *ins_vecs = (sz_u128_vec_t const *)&state->ins.u64s[0]; if (length <= 16) { - _sz_hash_minimal_update_serial(&state, ins_vecs[0]); - return _sz_hash_minimal_finalize_serial(&state, length); + _sz_hash_minimal_update_serial(&minimal_state, ins_vecs[0]); + return _sz_hash_minimal_finalize_serial(&minimal_state, length); } else if (length <= 32) { - _sz_hash_minimal_update_serial(&state, ins_vecs[0]); - _sz_hash_minimal_update_serial(&state, ins_vecs[1]); - return _sz_hash_minimal_finalize_serial(&state, length); + _sz_hash_minimal_update_serial(&minimal_state, ins_vecs[0]); + _sz_hash_minimal_update_serial(&minimal_state, ins_vecs[1]); + return _sz_hash_minimal_finalize_serial(&minimal_state, length); } else if (length <= 48) { - _sz_hash_minimal_update_serial(&state, ins_vecs[0]); - _sz_hash_minimal_update_serial(&state, ins_vecs[1]); - _sz_hash_minimal_update_serial(&state, ins_vecs[2]); - return _sz_hash_minimal_finalize_serial(&state, length); + _sz_hash_minimal_update_serial(&minimal_state, ins_vecs[0]); + _sz_hash_minimal_update_serial(&minimal_state, ins_vecs[1]); + _sz_hash_minimal_update_serial(&minimal_state, ins_vecs[2]); + return 
_sz_hash_minimal_finalize_serial(&minimal_state, length); } else { - _sz_hash_minimal_update_serial(&state, ins_vecs[0]); - _sz_hash_minimal_update_serial(&state, ins_vecs[1]); - _sz_hash_minimal_update_serial(&state, ins_vecs[2]); - _sz_hash_minimal_update_serial(&state, ins_vecs[3]); - return _sz_hash_minimal_finalize_serial(&state, length); + _sz_hash_minimal_update_serial(&minimal_state, ins_vecs[0]); + _sz_hash_minimal_update_serial(&minimal_state, ins_vecs[1]); + _sz_hash_minimal_update_serial(&minimal_state, ins_vecs[2]); + _sz_hash_minimal_update_serial(&minimal_state, ins_vecs[3]); + return _sz_hash_minimal_finalize_serial(&minimal_state, length); } } @@ -802,15 +804,15 @@ SZ_PUBLIC sz_u64_t sz_bytesum_haswell(sz_cptr_t text, sz_size_t length) { } SZ_INTERNAL void _sz_hash_minimal_init_haswell(_sz_hash_minimal_t *state, sz_u64_t seed) { - sz_u64_t const *pi = _sz_hash_pi_constants(); - __m128i const pi0 = _mm_load_si128((__m128i const *)(pi)); - __m128i const pi1 = _mm_load_si128((__m128i const *)(pi + 8)); // The key is made from the seed and half of it will be mixed with the length in the end __m128i seed_vec = _mm_set1_epi64x(seed); state->key.xmm = seed_vec; // XOR the user-supplied keys with the two "pi" constants + sz_u64_t const *pi = _sz_hash_pi_constants(); + __m128i const pi0 = _mm_load_si128((__m128i const *)(pi)); + __m128i const pi1 = _mm_load_si128((__m128i const *)(pi + 8)); __m128i k1 = _mm_xor_si128(seed_vec, pi0); __m128i k2 = _mm_xor_si128(seed_vec, pi1); @@ -867,14 +869,14 @@ SZ_INTERNAL void _sz_hash_state_update_haswell(sz_hash_state_t *state) { state->sum.xmms[3] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[3], shuffle_mask), state->ins.xmms[3]); } -SZ_INTERNAL sz_u64_t _sz_hash_state_finalize_haswell(sz_hash_state_t const *state_ptr) { +SZ_INTERNAL sz_u64_t _sz_hash_state_finalize_haswell(sz_hash_state_t const *state) { // Mix the length into the key - __m128i key_with_length = _mm_add_epi64(state_ptr->key.xmm, _mm_set_epi64x(0, state_ptr->ins_length)); + __m128i key_with_length = _mm_add_epi64(state->key.xmm, _mm_set_epi64x(0, state->ins_length)); // Combine the "sum" and the "AES" blocks - __m128i mixed_registers0 = _mm_aesenc_si128(state_ptr->sum.xmms[0], state_ptr->aes.xmms[0]); - __m128i mixed_registers1 = _mm_aesenc_si128(state_ptr->sum.xmms[1], state_ptr->aes.xmms[1]); - __m128i mixed_registers2 = _mm_aesenc_si128(state_ptr->sum.xmms[2], state_ptr->aes.xmms[2]); - __m128i mixed_registers3 = _mm_aesenc_si128(state_ptr->sum.xmms[3], state_ptr->aes.xmms[3]); + __m128i mixed_registers0 = _mm_aesenc_si128(state->sum.xmms[0], state->aes.xmms[0]); + __m128i mixed_registers1 = _mm_aesenc_si128(state->sum.xmms[1], state->aes.xmms[1]); + __m128i mixed_registers2 = _mm_aesenc_si128(state->sum.xmms[2], state->aes.xmms[2]); + __m128i mixed_registers3 = _mm_aesenc_si128(state->sum.xmms[3], state->aes.xmms[3]); // Combine the mixed registers __m128i mixed_registers01 = _mm_aesenc_si128(mixed_registers0, mixed_registers1); __m128i mixed_registers23 = _mm_aesenc_si128(mixed_registers2, mixed_registers3); @@ -975,72 +977,72 @@ SZ_PUBLIC sz_u64_t sz_hash_haswell(sz_cptr_t start, sz_size_t length, sz_u64_t s } } -SZ_PUBLIC void sz_hash_state_stream_haswell(sz_hash_state_t *state_ptr, sz_cptr_t text, sz_size_t length) { +SZ_PUBLIC void sz_hash_state_stream_haswell(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { while (length) { // Append to the internal buffer until it's full - if (state_ptr->ins_length % 64 == 0 && length >= 64) { - 
state_ptr->ins.xmms[0] = _mm_lddqu_si128((__m128i const *)text); - state_ptr->ins.xmms[1] = _mm_lddqu_si128((__m128i const *)(text + 16)); - state_ptr->ins.xmms[2] = _mm_lddqu_si128((__m128i const *)(text + 32)); - state_ptr->ins.xmms[3] = _mm_lddqu_si128((__m128i const *)(text + 48)); - _sz_hash_state_update_haswell(state_ptr); - state_ptr->ins_length += 64; + if (state->ins_length % 64 == 0 && length >= 64) { + state->ins.xmms[0] = _mm_lddqu_si128((__m128i const *)text); + state->ins.xmms[1] = _mm_lddqu_si128((__m128i const *)(text + 16)); + state->ins.xmms[2] = _mm_lddqu_si128((__m128i const *)(text + 32)); + state->ins.xmms[3] = _mm_lddqu_si128((__m128i const *)(text + 48)); + _sz_hash_state_update_haswell(state); + state->ins_length += 64; text += 64; length -= 64; } // If vectorization isn't that trivial - fall back to the serial implementation else { - sz_size_t progress_in_block = state_ptr->ins_length % 64; + sz_size_t progress_in_block = state->ins_length % 64; sz_size_t to_copy = sz_min_of_two(length, 64 - progress_in_block); int const will_fill_block = progress_in_block + to_copy == 64; // Update the metadata before we modify the `to_copy` variable - state_ptr->ins_length += to_copy; + state->ins_length += to_copy; length -= to_copy; // Append to the internal buffer until it's full - while (to_copy--) state_ptr->ins.u8s[progress_in_block++] = *text++; + while (to_copy--) state->ins.u8s[progress_in_block++] = *text++; // If we've reached the end of the buffer, update the state if (will_fill_block) { - _sz_hash_state_update_haswell(state_ptr); + _sz_hash_state_update_haswell(state); // Reset to zeros now, so we don't have to overwrite an immutable buffer in the folding state - for (int i = 0; i < 4; ++i) state_ptr->ins.xmms[i] = _mm_setzero_si128(); + for (int i = 0; i < 4; ++i) state->ins.xmms[i] = _mm_setzero_si128(); } } } } -SZ_PUBLIC sz_u64_t sz_hash_state_fold_haswell(sz_hash_state_t const *state_ptr) { - sz_size_t length = state_ptr->ins_length; - if (length >= 64) return _sz_hash_state_finalize_haswell(state_ptr); +SZ_PUBLIC sz_u64_t sz_hash_state_fold_haswell(sz_hash_state_t const *state) { + sz_size_t length = state->ins_length; + if (length >= 64) return _sz_hash_state_finalize_haswell(state); // Switch back to a smaller "minimal" state for small inputs - _sz_hash_minimal_t state; - state.key.xmm = state_ptr->key.xmm; - state.aes.xmm = state_ptr->aes.xmms[0]; - state.sum.xmm = state_ptr->sum.xmms[0]; + _sz_hash_minimal_t minimal_state; + minimal_state.key.xmm = state->key.xmm; + minimal_state.aes.xmm = state->aes.xmms[0]; + minimal_state.sum.xmm = state->sum.xmms[0]; // The logic is different depending on the length of the input - __m128i const *ins_vecs = (__m128i const *)&state_ptr->ins.xmms[0]; + __m128i const *ins_vecs = (__m128i const *)&state->ins.xmms[0]; if (length <= 16) { - _sz_hash_minimal_update_haswell(&state, ins_vecs[0]); - return _sz_hash_minimal_finalize_haswell(&state, length); + _sz_hash_minimal_update_haswell(&minimal_state, ins_vecs[0]); + return _sz_hash_minimal_finalize_haswell(&minimal_state, length); } else if (length <= 32) { - _sz_hash_minimal_update_haswell(&state, ins_vecs[0]); - _sz_hash_minimal_update_haswell(&state, ins_vecs[1]); - return _sz_hash_minimal_finalize_haswell(&state, length); + _sz_hash_minimal_update_haswell(&minimal_state, ins_vecs[0]); + _sz_hash_minimal_update_haswell(&minimal_state, ins_vecs[1]); + return _sz_hash_minimal_finalize_haswell(&minimal_state, length); } else if (length <= 48) { - 
_sz_hash_minimal_update_haswell(&state, ins_vecs[0]); - _sz_hash_minimal_update_haswell(&state, ins_vecs[1]); - _sz_hash_minimal_update_haswell(&state, ins_vecs[2]); - return _sz_hash_minimal_finalize_haswell(&state, length); + _sz_hash_minimal_update_haswell(&minimal_state, ins_vecs[0]); + _sz_hash_minimal_update_haswell(&minimal_state, ins_vecs[1]); + _sz_hash_minimal_update_haswell(&minimal_state, ins_vecs[2]); + return _sz_hash_minimal_finalize_haswell(&minimal_state, length); } else { - _sz_hash_minimal_update_haswell(&state, ins_vecs[0]); - _sz_hash_minimal_update_haswell(&state, ins_vecs[1]); - _sz_hash_minimal_update_haswell(&state, ins_vecs[2]); - _sz_hash_minimal_update_haswell(&state, ins_vecs[3]); - return _sz_hash_minimal_finalize_haswell(&state, length); + _sz_hash_minimal_update_haswell(&minimal_state, ins_vecs[0]); + _sz_hash_minimal_update_haswell(&minimal_state, ins_vecs[1]); + _sz_hash_minimal_update_haswell(&minimal_state, ins_vecs[2]); + _sz_hash_minimal_update_haswell(&minimal_state, ins_vecs[3]); + return _sz_hash_minimal_finalize_haswell(&minimal_state, length); } } @@ -1255,15 +1257,35 @@ SZ_PUBLIC sz_u64_t sz_hash_skylake(sz_cptr_t start, sz_size_t length, sz_u64_t s } } -SZ_PUBLIC void sz_generate_skylake(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { - sz_generate_serial(text, length, nonce); +SZ_PUBLIC void sz_hash_state_stream_skylake(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { + while (length) { + sz_size_t const progress_in_block = state->ins_length % 64; + sz_size_t const to_copy = sz_min_of_two(length, 64 - progress_in_block); + int const will_fill_block = progress_in_block + to_copy == 64; + // Update the metadata before we modify the `to_copy` variable + state->ins_length += to_copy; + length -= to_copy; + // Append to the internal buffer until it's full + __mmask64 to_copy_mask = _sz_u64_mask_until(to_copy); + _mm512_mask_storeu_epi8(&state->ins.u8s[0] + progress_in_block, to_copy_mask, + _mm512_maskz_loadu_epi8(to_copy_mask, text)); + text += to_copy; + // If we've reached the end of the buffer, update the state + if (will_fill_block) { + _sz_hash_state_update_haswell(state); + // Reset to zeros now, so we don't have to overwrite an immutable buffer in the folding state + state->ins.zmm = _mm512_setzero_si512(); + } + } } -SZ_PUBLIC void sz_hash_state_stream_skylake(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { - sz_hash_state_stream_serial(state, text, length); +SZ_PUBLIC sz_u64_t sz_hash_state_fold_skylake(sz_hash_state_t const *state) { + return sz_hash_state_fold_haswell(state); } -SZ_PUBLIC sz_u64_t sz_hash_state_fold_skylake(sz_hash_state_t const *state) { return sz_hash_state_fold_serial(state); } +SZ_PUBLIC void sz_generate_skylake(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { + sz_generate_serial(text, length, nonce); +} #pragma clang attribute pop #pragma GCC pop_options @@ -1491,6 +1513,34 @@ SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed) } } +SZ_PUBLIC void sz_hash_state_init_ice(sz_hash_state_t *state, sz_u64_t seed) { + sz_hash_state_init_skylake(state, seed); +} + +SZ_PUBLIC void sz_hash_state_stream_ice(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { + while (length) { + sz_size_t progress_in_block = state->ins_length % 64; + sz_size_t to_copy = sz_min_of_two(length, 64 - progress_in_block); + int const will_fill_block = progress_in_block + to_copy == 64; + // Update the metadata before we modify the `to_copy` variable + state->ins_length += to_copy; + 
length -= to_copy; + // Append to the internal buffer until it's full + __mmask64 to_copy_mask = _sz_u64_mask_until(to_copy); + _mm512_mask_storeu_epi8(state->ins.u8s + progress_in_block, to_copy_mask, + _mm512_maskz_loadu_epi8(to_copy_mask, text)); + text += to_copy; + // If we've reached the end of the buffer, update the state + if (will_fill_block) { + _sz_hash_state_update_ice(state); + // Reset to zeros now, so we don't have to overwrite an immutable buffer in the folding state + state->ins.zmm = _mm512_setzero_si512(); + } + } +} + +SZ_PUBLIC sz_u64_t sz_hash_state_fold_ice(sz_hash_state_t const *state) { return sz_hash_state_fold_haswell(state); } + SZ_PUBLIC void sz_generate_ice(sz_ptr_t output, sz_size_t length, sz_u64_t nonce) { // We can use `_mm512_broadcast_i32x4` and the `vbroadcasti32x4` instruction, but its latency is freaking 8 cycles. // The `_mm512_shuffle_i32x4` and the `vshufi32x4` instruction has a latency of 3 cycles, somewhat better. @@ -1544,16 +1594,6 @@ SZ_PUBLIC void sz_generate_ice(sz_ptr_t output, sz_size_t length, sz_u64_t nonce } } -SZ_PUBLIC void sz_hash_state_init_ice(sz_hash_state_t *state, sz_u64_t seed) { - sz_hash_state_init_skylake(state, seed); -} - -SZ_PUBLIC void sz_hash_state_stream_ice(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { - sz_hash_state_stream_serial(state, text, length); -} - -SZ_PUBLIC sz_u64_t sz_hash_state_fold_ice(sz_hash_state_t const *state) { return sz_hash_state_fold_serial(state); } - #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_ICE From 80688bb11e136bb6227d1bc9089606620ea142c8 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 26 Feb 2025 17:25:22 +0000 Subject: [PATCH 130/751] Improve: Testing hash functions --- scripts/test.cpp | 145 ++++++++++++++++++++++++++++++++++++++++------- scripts/test.hpp | 26 ++++++++- 2 files changed, 151 insertions(+), 20 deletions(-) diff --git a/scripts/test.cpp b/scripts/test.cpp index e3a62f3d..13090d8c 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -1,27 +1,43 @@ +/** + * @brief Extensive @b unit-testing suite for StringZilla, written in C++. + * @note It tests one target hardware platform at a time and should be compiled and run separately for each. + * To override the default hardware platform, overrides the @b `SZ_USE_*` flags at the top of this file. + * + * @see Stress-tests on real-world and synthetic data are integrated into the @b `scripts/bench*.cpp` benchmarks. + * + * @file test.cpp + * @author Ash Vardanian + */ + +#include #undef NDEBUG // Enable all assertions -// Enable assertions for iterators +/* The Visual C++ run-time library detects incorrect iterator use, + * and asserts and displays a dialog box at run time on Windows. + */ #if !defined(_ITERATOR_DEBUG_LEVEL) || _ITERATOR_DEBUG_LEVEL == 0 #define _ITERATOR_DEBUG_LEVEL 1 #endif #include // assertions -// Overload the following with caution. -// Those parameters must never be explicitly set during releases, -// but they come handy during development, if you want to validate -// different ISA-specific implementations. +/** + * ! Overload the following with caution. + * ! Those parameters must never be explicitly set during releases, + * ! but they come handy during development, if you want to validate + * ! different ISA-specific implementations. 
+ */ // #define SZ_USE_HASWELL 0 +// #define SZ_USE_SKYLAKE 0 // #define SZ_USE_ICE 0 // #define SZ_USE_NEON 0 // #define SZ_USE_SVE 0 #define SZ_DEBUG 1 // Enforce aggressive logging for this unit. -// Put this at the top to make sure it pulls all the right dependencies #include #if defined(__SANITIZE_ADDRESS__) -#include // ASAN +#include // We use ASAN API to poison memory addresses #endif #include // `std::transform` @@ -133,6 +149,84 @@ static void test_arithmetical_utilities() { #endif } +/** + * @brief Several string processing operations rely on computing integer logarithms. + * Failures in such operations will result in wrong `resize` outcomes and heap corruption. + */ +static void test_hashing_on_platform( // + sz_hash_t hash_base, sz_hash_state_init_t init_base, // + sz_hash_state_stream_t stream_base, sz_hash_state_fold_t fold_base, // + sz_hash_t hash_simd, sz_hash_state_init_t init_simd, // + sz_hash_state_stream_t stream_simd, sz_hash_state_fold_t fold_simd) { + + auto test_on_seed = [&](std::string text, sz_u64_t seed) { + // Compute the entire hash at once, expecting the same output + sz_u64_t result_base = hash_base(text.data(), text.size(), seed); + sz_u64_t result_simd = hash_simd(text.data(), text.size(), seed); + assert(result_base == result_simd); + + // Compare incremental hashing across platforms + sz_hash_state_t state_base, state_simd; + init_base(&state_base, seed); + init_simd(&state_simd, seed); + assert(sz_hash_state_equal(&state_base, &state_base) == sz_true_k); // Self-equality + assert(sz_hash_state_equal(&state_simd, &state_simd) == sz_true_k); // Self-equality + assert(sz_hash_state_equal(&state_base, &state_simd) == sz_true_k); // Same across platforms + + // Try breaking those strings into arbitrary chunks, expecting the same output in the streaming mode. + // The length of each chunk and the number of chunks will be determined with a coin toss. 
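+        // Folding takes the state by `const` pointer, so it is safe to fold after every slice while the stream continues.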
+ iterate_in_random_slices(text, [&](std::string slice) { + stream_base(&state_base, slice.data(), slice.size()); + stream_simd(&state_simd, slice.data(), slice.size()); + assert(sz_hash_state_equal(&state_base, &state_simd) == sz_true_k); // Same across platforms + result_base = fold_base(&state_base); + result_simd = fold_simd(&state_simd); + assert(result_base == result_simd); + }); + }; + + // Let's try different-length strings repeating a "abc" pattern: + std::vector seeds = { + 0u, 42u, // + std::numeric_limits::max(), // + std::numeric_limits::max(), // + }; + for (auto seed : seeds) + for (std::size_t copies = 1; copies != 100; ++copies) // + test_on_seed(repeat("abc", copies), seed); +} + +static void test_hashing_across_platforms() { +#if SZ_USE_HASWELL + test_hashing_on_platform( // + sz_hash_serial, sz_hash_state_init_serial, // + sz_hash_state_stream_serial, sz_hash_state_fold_serial, // + sz_hash_haswell, sz_hash_state_init_haswell, // + sz_hash_state_stream_haswell, sz_hash_state_fold_haswell); +#endif +#if SZ_USE_SKYLAKE + test_hashing_on_platform( // + sz_hash_serial, sz_hash_state_init_serial, // + sz_hash_state_stream_serial, sz_hash_state_fold_serial, // + sz_hash_skylake, sz_hash_state_init_skylake, // + sz_hash_state_stream_skylake, sz_hash_state_fold_skylake); +#endif +#if SZ_USE_ICE + test_hashing_on_platform( // + sz_hash_serial, sz_hash_state_init_serial, // + sz_hash_state_stream_serial, sz_hash_state_fold_serial, // + sz_hash_ice, sz_hash_state_init_ice, // + sz_hash_state_stream_ice, sz_hash_state_fold_ice); +#endif +#if SZ_USE_NEON + test_hashing_on_platform( // + sz_hash_serial, sz_hash_state_init_serial, // + sz_hash_state_stream_serial, sz_hash_state_fold_serial, // + sz_hash_neon, sz_hash_state_init_neon, // + sz_hash_state_stream_neon, sz_hash_state_fold_neon); +#endif +}; + /** * @brief Tests various ASCII-based methods (e.g., `is_alpha`, `is_digit`) * provided by `sz::string` and `sz::string_view`. @@ -291,10 +385,10 @@ static void test_memory_utilities( // #if 0 // TODO: // We are going to randomly select the "source" and "target" slices of the strings. - // For `memcpy` and `memset` the offsets should have uniform ditribution, + // For `memcpy` and `memset` the offsets should have uniform distribution, // while the length should decay with an exponential distribution. // For `memmove` the offset should be uniform, but the "shift" and "length" should - // be exponenetial. The exponential distributions should be functions of the cache line width. + // be exponential. The exponential distributions should be functions of the cache line width. 
// https://en.cppreference.com/w/cpp/numeric/random/exponential_distribution std::string dataset(max_l2_size, '-'); auto &gen = global_random_generator(); @@ -953,13 +1047,13 @@ static void test_constructors() { strings.push_back(alphabet.substr(0, alphabet_slice)); std::vector copies {strings}; assert(copies.size() == strings.size()); - for (size_t i = 0; i < copies.size(); i++) { + for (size_t i = 0; i < copies.size(); ++i) { assert(copies[i].size() == strings[i].size()); assert(copies[i] == strings[i]); for (size_t j = 0; j < strings[i].size(); j++) { assert(copies[i][j] == strings[i][j]); } } std::vector assignments = strings; - for (size_t i = 0; i < assignments.size(); i++) { + for (size_t i = 0; i < assignments.size(); ++i) { assert(assignments[i].size() == strings[i].size()); assert(assignments[i] == strings[i]); for (size_t j = 0; j < strings[i].size(); j++) { assert(assignments[i][j] == strings[i][j]); } @@ -1027,12 +1121,12 @@ static void test_memory_stability_for_length(std::size_t len = 1ull << 10) { using string = sz::basic_string; string base; - for (std::size_t i = 0; i < len; i++) base.push_back('c'); + for (std::size_t i = 0; i < len; ++i) base.push_back('c'); assert(base.length() == len); // Do copies leak? assert_balanced_memory([&]() { - for (std::size_t i = 0; i < iterations; i++) { + for (std::size_t i = 0; i < iterations; ++i) { string copy(base); assert(copy.length() == len); assert(copy == base); @@ -1041,7 +1135,7 @@ static void test_memory_stability_for_length(std::size_t len = 1ull << 10) { // How about assignments? assert_balanced_memory([&]() { - for (std::size_t i = 0; i < iterations; i++) { + for (std::size_t i = 0; i < iterations; ++i) { string copy; copy = base; assert(copy.length() == len); @@ -1051,7 +1145,7 @@ static void test_memory_stability_for_length(std::size_t len = 1ull << 10) { // How about the move constructor? assert_balanced_memory([&]() { - for (std::size_t i = 0; i < iterations; i++) { + for (std::size_t i = 0; i < iterations; ++i) { string unique_item(base); assert(unique_item.length() == len); assert(unique_item == base); @@ -1063,7 +1157,7 @@ static void test_memory_stability_for_length(std::size_t len = 1ull << 10) { // And the move assignment operator with an empty target payload? assert_balanced_memory([&]() { - for (std::size_t i = 0; i < iterations; i++) { + for (std::size_t i = 0; i < iterations; ++i) { string unique_item(base); string copy; copy = std::move(unique_item); @@ -1074,7 +1168,7 @@ static void test_memory_stability_for_length(std::size_t len = 1ull << 10) { // And move assignment where the target had a payload? 
assert_balanced_memory([&]() { - for (std::size_t i = 0; i < iterations; i++) { + for (std::size_t i = 0; i < iterations; ++i) { string unique_item(base); string copy; for (std::size_t j = 0; j < 317; j++) copy.push_back('q'); @@ -1570,7 +1664,7 @@ void test_replacements(std::size_t lookup_tables_to_try = 128, std::size_t slice for (std::size_t lookup_table_variation = 0; lookup_table_variation != lookup_tables_to_try; ++lookup_table_variation) { sz::look_up_table lut; - for (std::size_t i = 0; i < 256; i++) lut[(char)i] = (char)(std::rand() % 256); + for (std::size_t i = 0; i < 256; ++i) lut[(char)i] = (char)(std::rand() % 256); for (std::size_t slice_idx = 0; slice_idx != slices_per_table; ++slice_idx) { std::size_t slice_offset = std::rand() % (body.length()); @@ -1597,7 +1691,7 @@ static void test_sequence_algorithms() { sz_sequence_t sequence; sz_cptr_t strings[] = {"banana", "apple", "cherry"}; sz_sequence_from_null_terminated_strings(strings, 3, &sequence); - assert(sequence.size == 3); + assert(sequence.count == 3); assert(sequence.get_start(sequence.handle, 0) == "banana"_sv); assert(sequence.get_start(sequence.handle, 1) == "apple"_sv); assert(sequence.get_start(sequence.handle, 2) == "cherry"_sv); @@ -1687,6 +1781,14 @@ static void test_stl_containers() { int main(int argc, char const **argv) { + sz_u128_vec_t some_state, some_key; + randomize_string((char *)&some_state.u8s[0], 16); + randomize_string((char *)&some_key.u8s[0], 16); + sz_u128_vec_t emulated_result = _sz_emulate_aesenc_si128_serial(some_state, some_key); + sz_u128_vec_t hardware_result; + hardware_result.xmm = _mm_aesenc_si128(some_state.xmm, some_key.xmm); + assert(memcmp(&emulated_result, &hardware_result, sizeof(sz_u128_vec_t)) == 0); + // Let's greet the user nicely sz_unused(argc && argv); std::printf("Hi, dear tester! You look nice today!\n"); @@ -1698,6 +1800,11 @@ int main(int argc, char const **argv) { // Basic utilities test_arithmetical_utilities(); + + // Compatibility across hardware-specific implementations + test_hashing_across_platforms(); + + // Core APIs test_ascii_utilities(); test_ascii_utilities(); test_memory_utilities(); diff --git a/scripts/test.hpp b/scripts/test.hpp index 6c37e9f6..ede983bc 100644 --- a/scripts/test.hpp +++ b/scripts/test.hpp @@ -1,5 +1,5 @@ /** - * @brief Helper structures and functions for C++ tests. + * @brief Helper structures and functions for C++ unit- and stress-tests. */ #pragma once #include // `std::ifstream` @@ -56,12 +56,36 @@ inline void randomize_string(char *string, std::size_t length, char const *alpha std::generate(string, string + length, [&]() -> char { return alphabet[distribution(global_random_generator())]; }); } +inline void randomize_string(char *string, std::size_t length) { + uniform_uint8_distribution_t distribution; + std::generate(string, string + length, [&]() -> char { return distribution(global_random_generator()); }); +} + inline std::string random_string(std::size_t length, char const *alphabet, std::size_t cardinality) { std::string result(length, '\0'); randomize_string(&result[0], length, alphabet, cardinality); return result; } +inline std::string repeat(std::string const &patten, std::size_t count) { + std::string result(patten.size() * count, '\0'); + for (std::size_t i = 0; i < count; ++i) std::copy(patten.begin(), patten.end(), result.begin() + i * patten.size()); + return result; +} + +/** + * @brief A callback type for iterating over consecutive random-length slices of a string. 
+ */ +template +inline void iterate_in_random_slices(std::string const &text, slice_callback_type_ &&slice_callback) { + std::size_t remaining = text.size(); + while (remaining > 0) { + std::size_t slice_length = std::uniform_int_distribution(1, remaining)(global_random_generator()); + slice_callback({text.data() + text.size() - remaining, slice_length}); + remaining -= slice_length; + } +} + /** * @brief Inefficient baseline Levenshtein distance computation, as implemented in most codebases. * Allocates a new matrix on every call, with rows potentially scattered around memory. From 6659aa0869582df0e611fbdcb457fd83aca0917e Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 26 Feb 2025 17:42:41 +0000 Subject: [PATCH 131/751] Add: PRNG for Haswell & serial backend --- include/stringzilla/hash.h | 169 ++++++++++++++++++++++++++++++------- scripts/test.cpp | 42 +++++++-- 2 files changed, 174 insertions(+), 37 deletions(-) diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index 50deb776..6ef11e3d 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -125,14 +125,14 @@ SZ_DYNAMIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length, sz_u64_t seed); * @brief A Pseudorandom Number Generator (PRNG), inspired the AES-CTR-128 algorithm, * but using only one round of AES mixing as opposed to "NIST SP 800-90A". * - * CTR_DRBG (CounTeR mode Deterministic Random Bit Generator) appears secure and indistinguishable from a true - * random source when AES is used as the underlying block cipher and 112 bits are taken from this PRNG. + * CTR_DRBG (CounTeR mode Deterministic Random Bit Generator) appears secure and indistinguishable from a + * true random source when AES is used as the underlying block cipher and 112 bits are taken from this PRNG. * When AES is used as the underlying block cipher and 128 bits are taken from each instantiation, * the required security level is delivered with the caveat that a 128-bit cipher's output in * counter mode can be distinguished from a true RNG. * * In this case, it doesn't apply, as we only use one round of AES mixing. We also don't expose a separate "key", - * only a "nonce", to keep the API simple. + * only a "nonce", to keep the API simple, but we mix it with 512 bits of Pi constants to increase randomness. * * @param[out] text Output string buffer to be populated. * @param[in] length Number of bytes in the string. @@ -145,7 +145,7 @@ SZ_DYNAMIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length, sz_u64_t seed); * int main() { * char first_buffer[5], second_buffer[5]; * sz_generate(first_buffer, 5, 0); - * sz_generate(second_buffer, 5, 0); //? Same nonce will produce the same output + * sz_generate(second_buffer, 5, 0); //? Same nonce must produce the same output * return sz_bytesum(first_buffer, 5) == sz_bytesum(second_buffer, 5) ? 
0 : 1; * } * @endcode @@ -705,7 +705,19 @@ SZ_PUBLIC sz_u64_t sz_hash_state_fold_serial(sz_hash_state_t const *state) { } SZ_PUBLIC void sz_generate_serial(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { - sz_unused(text && length && nonce); + sz_u64_t const *pi_ptr = _sz_hash_pi_constants(); + sz_u128_vec_t input_vec, pi_vec, key_vec, generated_vec; + for (sz_size_t lane_index = 0; length; ++lane_index) { + // Each 128-bit block is initialized with the same nonce + input_vec.u64s[0] = input_vec.u64s[1] = nonce + lane_index; + // We rotate the first 512-bits of the Pi to mix with the nonce + pi_vec = ((sz_u128_vec_t const *)pi_ptr)[lane_index % 4]; + key_vec.u64s[0] = nonce ^ pi_vec.u64s[0]; + key_vec.u64s[1] = nonce ^ pi_vec.u64s[1]; + generated_vec = _sz_emulate_aesenc_si128_serial(input_vec, key_vec); + // Export back to the user-supplied buffer + for (int i = 0; i < 16 && length; ++i, --length) *text++ = generated_vec.u8s[i]; + } } #pragma endregion // Serial Implementation @@ -1047,7 +1059,97 @@ SZ_PUBLIC sz_u64_t sz_hash_state_fold_haswell(sz_hash_state_t const *state) { } SZ_PUBLIC void sz_generate_haswell(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { - sz_generate_serial(text, length, nonce); + sz_u64_t const *pi_ptr = _sz_hash_pi_constants(); + if (length <= 16) { + __m128i input = _mm_set1_epi64x(nonce); + __m128i pi = _mm_load_si128((__m128i const *)pi_ptr); + __m128i key = _mm_xor_si128(_mm_set1_epi64x(nonce), pi); + __m128i generated = _mm_aesenc_si128(input, key); + // Now the tricky part is outputting this data to the user-supplied buffer + // without masked writes, like in AVX-512. + for (sz_size_t i = 0; i < length; ++i) text[i] = ((sz_u8_t *)&generated)[i]; + } + // Assuming the YMM register contains two 128-bit blocks, the input to the generator + // will be more complex, containing the sum of the nonce and the block number. + else if (length <= 32) { + __m128i inputs[2], pis[2], keys[2], generated[2]; + inputs[0] = _mm_set1_epi64x(nonce); + inputs[1] = _mm_set1_epi64x(nonce + 1); + pis[0] = _mm_load_si128((__m128i const *)(pi_ptr)); + pis[1] = _mm_load_si128((__m128i const *)(pi_ptr + 2)); + keys[0] = _mm_xor_si128(_mm_set1_epi64x(nonce), pis[0]); + keys[1] = _mm_xor_si128(_mm_set1_epi64x(nonce), pis[1]); + generated[0] = _mm_aesenc_si128(inputs[0], keys[0]); + generated[1] = _mm_aesenc_si128(inputs[1], keys[1]); + // The first store can easily be vectorized, but the second can be serial for now + _mm_storeu_si128((__m128i *)text, generated[0]); + for (sz_size_t i = 16; i < length; ++i) text[i] = ((sz_u8_t *)&generated[1])[i - 16]; + } + // The last special case we handle outside of the primary loop is for buffers up to 64 bytes long. 
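+    // Three 128-bit blocks are generated in this branch, covering inputs of up to 48 bytes; longer inputs fall through to the 64-byte main loop below.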
+ else if (length <= 48) { + __m128i inputs[3], pis[3], keys[3], generated[3]; + inputs[0] = _mm_set1_epi64x(nonce); + inputs[1] = _mm_set1_epi64x(nonce + 1); + inputs[2] = _mm_set1_epi64x(nonce + 2); + pis[0] = _mm_load_si128((__m128i const *)(pi_ptr)); + pis[1] = _mm_load_si128((__m128i const *)(pi_ptr + 2)); + pis[2] = _mm_load_si128((__m128i const *)(pi_ptr + 4)); + keys[0] = _mm_xor_si128(_mm_set1_epi64x(nonce), pis[0]); + keys[1] = _mm_xor_si128(_mm_set1_epi64x(nonce), pis[1]); + keys[2] = _mm_xor_si128(_mm_set1_epi64x(nonce), pis[2]); + generated[0] = _mm_aesenc_si128(inputs[0], keys[0]); + generated[1] = _mm_aesenc_si128(inputs[1], keys[1]); + generated[2] = _mm_aesenc_si128(inputs[2], keys[2]); + // The first store can easily be vectorized, but the second can be serial for now + _mm_storeu_si128((__m128i *)text, generated[0]); + _mm_storeu_si128((__m128i *)(text + 16), generated[1]); + for (sz_size_t i = 32; i < length; ++i) text[i] = ((sz_u8_t *)generated)[i]; + } + // The final part of the function is the primary loop, which processes the buffer in 64-byte chunks. + else { + __m128i inputs[4], pis[4], keys[4], generated[4]; + inputs[0] = _mm_set1_epi64x(nonce); + inputs[1] = _mm_set1_epi64x(nonce + 1); + inputs[2] = _mm_set1_epi64x(nonce + 2); + inputs[3] = _mm_set1_epi64x(nonce + 3); + // Load parts of PI into the registers + pis[0] = _mm_load_si128((__m128i const *)(pi_ptr)); + pis[1] = _mm_load_si128((__m128i const *)(pi_ptr + 2)); + pis[2] = _mm_load_si128((__m128i const *)(pi_ptr + 4)); + pis[3] = _mm_load_si128((__m128i const *)(pi_ptr + 6)); + // XOR the nonce with the PI constants + keys[0] = _mm_xor_si128(_mm_set1_epi64x(nonce), pis[0]); + keys[1] = _mm_xor_si128(_mm_set1_epi64x(nonce), pis[1]); + keys[2] = _mm_xor_si128(_mm_set1_epi64x(nonce), pis[2]); + keys[3] = _mm_xor_si128(_mm_set1_epi64x(nonce), pis[3]); + + // Produce the output, fixing the key and enumerating input chunks. + sz_size_t i = 0; + __m128i const increment = _mm_set1_epi64x(4); + for (; i + 64 <= length; i += 64) { + generated[0] = _mm_aesenc_si128(inputs[0], keys[0]); + generated[1] = _mm_aesenc_si128(inputs[1], keys[1]); + generated[2] = _mm_aesenc_si128(inputs[2], keys[2]); + generated[3] = _mm_aesenc_si128(inputs[3], keys[3]); + _mm_storeu_si128((__m128i *)(text + i), generated[0]); + _mm_storeu_si128((__m128i *)(text + i + 16), generated[1]); + _mm_storeu_si128((__m128i *)(text + i + 32), generated[2]); + _mm_storeu_si128((__m128i *)(text + i + 48), generated[3]); + inputs[0] = _mm_add_epi64(inputs[0], increment); + inputs[1] = _mm_add_epi64(inputs[1], increment); + inputs[2] = _mm_add_epi64(inputs[2], increment); + inputs[3] = _mm_add_epi64(inputs[3], increment); + } + + // Handle the tail of the buffer. + { + generated[0] = _mm_aesenc_si128(inputs[0], keys[0]); + generated[1] = _mm_aesenc_si128(inputs[1], keys[1]); + generated[2] = _mm_aesenc_si128(inputs[2], keys[2]); + generated[3] = _mm_aesenc_si128(inputs[3], keys[3]); + for (sz_size_t j = 0; i < length; ++i, ++j) text[i] = ((sz_u8_t *)generated)[j]; + } + } } #pragma clang attribute pop @@ -1280,6 +1382,7 @@ SZ_PUBLIC void sz_hash_state_stream_skylake(sz_hash_state_t *state, sz_cptr_t te } SZ_PUBLIC sz_u64_t sz_hash_state_fold_skylake(sz_hash_state_t const *state) { + //? We don't know a better way to fold the state on Ice Lake, than to use the Haswell implementation. 
return sz_hash_state_fold_haswell(state); } @@ -1539,58 +1642,62 @@ SZ_PUBLIC void sz_hash_state_stream_ice(sz_hash_state_t *state, sz_cptr_t text, } } -SZ_PUBLIC sz_u64_t sz_hash_state_fold_ice(sz_hash_state_t const *state) { return sz_hash_state_fold_haswell(state); } +SZ_PUBLIC sz_u64_t sz_hash_state_fold_ice(sz_hash_state_t const *state) { + //? We don't know a better way to fold the state on Ice Lake, than to use the Haswell implementation. + return sz_hash_state_fold_haswell(state); +} SZ_PUBLIC void sz_generate_ice(sz_ptr_t output, sz_size_t length, sz_u64_t nonce) { - // We can use `_mm512_broadcast_i32x4` and the `vbroadcasti32x4` instruction, but its latency is freaking 8 cycles. - // The `_mm512_shuffle_i32x4` and the `vshufi32x4` instruction has a latency of 3 cycles, somewhat better. - // The `_mm512_permutex_epi64` and the `vpermq` instruction also has a latency of 3 cycles. - // So we want to avoid that, if possible. - __m128i nonce_vec = _mm_set1_epi64x(nonce); - __m128i key128 = _mm_xor_si128(nonce_vec, _mm_set_epi64x(0x13198a2e03707344ull, 0x243f6a8885a308d3ull)); if (length <= 16) { - __mmask16 mask = _sz_u16_mask_until(length); __m128i input = _mm_set1_epi64x(nonce); - __m128i generated = _mm_aesenc_si128(input, key128); - _mm_mask_storeu_epi8((void *)output, mask, generated); + __m128i pi = _mm_load_si128((__m128i const *)_sz_hash_pi_constants()); + __m128i key = _mm_xor_si128(_mm_set1_epi64x(nonce), pi); + __m128i generated = _mm_aesenc_si128(input, key); + __mmask16 store_mask = _sz_u16_mask_until(length); + _mm_mask_storeu_epi8((void *)output, store_mask, generated); } // Assuming the YMM register contains two 128-bit blocks, the input to the generator // will be more complex, containing the sum of the nonce and the block number. else if (length <= 32) { - __mmask32 mask = _sz_u32_mask_until(length); __m256i input = _mm256_set_epi64x(nonce + 1, nonce + 1, nonce, nonce); - __m256i key256 = - _mm256_permute2x128_si256(_mm256_castsi128_si256(key128), _mm256_castsi128_si256(key128), 0x00); - __m256i generated = _mm256_aesenc_epi128(input, key256); - _mm256_mask_storeu_epi8((void *)output, mask, generated); + __m256i pi = _mm256_load_si256((__m256i const *)_sz_hash_pi_constants()); + __m256i key = _mm256_xor_si256(_mm256_set1_epi64x(nonce), pi); + __m256i generated = _mm256_aesenc_epi128(input, key); + __mmask32 store_mask = _sz_u32_mask_until(length); + _mm256_mask_storeu_epi8((void *)output, store_mask, generated); } // The last special case we handle outside of the primary loop is for buffers up to 64 bytes long. else if (length <= 64) { - __mmask64 mask = _sz_u64_mask_until(length); __m512i input = _mm512_set_epi64( // nonce + 3, nonce + 3, nonce + 2, nonce + 2, // nonce + 1, nonce + 1, nonce, nonce); - __m512i key512 = _mm512_permutex_epi64(_mm512_castsi128_si512(key128), 0x00); - __m512i generated = _mm512_aesenc_epi128(input, key512); - _mm512_mask_storeu_epi8((void *)output, mask, generated); + __m512i pi = _mm512_load_si512((__m512i const *)_sz_hash_pi_constants()); + __m512i key = _mm512_xor_si512(_mm512_set1_epi64(nonce), pi); + __m512i generated = _mm512_aesenc_epi128(input, key); + __mmask64 store_mask = _sz_u64_mask_until(length); + _mm512_mask_storeu_epi8((void *)output, store_mask, generated); } // The final part of the function is the primary loop, which processes the buffer in 64-byte chunks. 
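    // Each 512-bit register packs four 128-bit counter blocks, so one AES round per iteration yields 64 bytes of output.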
else { - __m512i increment = _mm512_set1_epi64(4); + __m512i const increment = _mm512_set1_epi64(4); __m512i input = _mm512_set_epi64( // nonce + 3, nonce + 3, nonce + 2, nonce + 2, // nonce + 1, nonce + 1, nonce, nonce); - __m512i key512 = _mm512_permutex_epi64(_mm512_castsi128_si512(key128), 0x00); + __m512i const pi = _mm512_load_si512((__m512i const *)_sz_hash_pi_constants()); + __m512i const key = _mm512_xor_si512(_mm512_set1_epi64(nonce), pi); + + // Produce the output, fixing the key and enumerating input chunks. sz_size_t i = 0; for (; i + 64 <= length; i += 64) { - __m512i generated = _mm512_aesenc_epi128(input, key512); + __m512i generated = _mm512_aesenc_epi128(input, key); _mm512_storeu_epi8((void *)(output + i), generated); input = _mm512_add_epi64(input, increment); } + // Handle the tail of the buffer. - __mmask64 mask = _sz_u64_mask_until(length - i); - __m512i generated = _mm512_aesenc_epi128(input, key512); - _mm512_mask_storeu_epi8((void *)(output + i), mask, generated); + __m512i generated = _mm512_aesenc_epi128(input, key); + __mmask64 store_mask = _sz_u64_mask_until(length - i); + _mm512_mask_storeu_epi8((void *)(output + i), store_mask, generated); } } diff --git a/scripts/test.cpp b/scripts/test.cpp index 13090d8c..8465cf15 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -150,8 +150,10 @@ static void test_arithmetical_utilities() { } /** - * @brief Several string processing operations rely on computing integer logarithms. - * Failures in such operations will result in wrong `resize` outcomes and heap corruption. + * @brief Hashes a string and compares the output between a serial and hardware-specific SIMD backend. + * + * The test covers increasingly long and complex strings, starting with "abcabc..." repetitions and + * progressing towards corner cases like empty strings, all-zero inputs, zero seeds, and so on. */ static void test_hashing_on_platform( // sz_hash_t hash_base, sz_hash_state_init_t init_base, // @@ -196,13 +198,40 @@ static void test_hashing_on_platform( // test_on_seed(repeat("abc", copies), seed); } -static void test_hashing_across_platforms() { +/** + * @brief Tests Pseudo-Random Number Generators (PRNGs) ensuring that the same nonce + * produces exactly the same output across different SIMD implementations. 
+ */ +static void test_random_generator_on_platform(sz_generate_t generate_base, sz_generate_t generate_simd) { + + auto test_on_nonce = [&](std::size_t length, sz_u64_t nonce) { + std::string text_base(length, '\0'); + std::string text_simd(length, '\0'); + generate_base(&text_base[0], static_cast(length), nonce); + generate_simd(&text_simd[0], static_cast(length), nonce); + assert(text_base == text_simd); + }; + + // Let's try different nonces: + std::vector nonces = { + 0u, 42u, // + std::numeric_limits::max(), // + std::numeric_limits::max(), // + }; + std::vector lengths = {1, 11, 23, 37, 40, 51, 64, 128, 1000}; + for (auto nonce : nonces) + for (auto length : lengths) // + test_on_nonce(length, nonce); +} + +static void test_simd_against_serial() { #if SZ_USE_HASWELL test_hashing_on_platform( // sz_hash_serial, sz_hash_state_init_serial, // sz_hash_state_stream_serial, sz_hash_state_fold_serial, // sz_hash_haswell, sz_hash_state_init_haswell, // sz_hash_state_stream_haswell, sz_hash_state_fold_haswell); + test_random_generator_on_platform(sz_generate_serial, sz_generate_haswell); #endif #if SZ_USE_SKYLAKE test_hashing_on_platform( // @@ -210,6 +239,7 @@ static void test_hashing_across_platforms() { sz_hash_state_stream_serial, sz_hash_state_fold_serial, // sz_hash_skylake, sz_hash_state_init_skylake, // sz_hash_state_stream_skylake, sz_hash_state_fold_skylake); + test_random_generator_on_platform(sz_generate_serial, sz_generate_skylake); #endif #if SZ_USE_ICE test_hashing_on_platform( // @@ -217,6 +247,7 @@ static void test_hashing_across_platforms() { sz_hash_state_stream_serial, sz_hash_state_fold_serial, // sz_hash_ice, sz_hash_state_init_ice, // sz_hash_state_stream_ice, sz_hash_state_fold_ice); + test_random_generator_on_platform(sz_generate_serial, sz_generate_ice); #endif #if SZ_USE_NEON test_hashing_on_platform( // @@ -224,6 +255,7 @@ static void test_hashing_across_platforms() { sz_hash_state_stream_serial, sz_hash_state_fold_serial, // sz_hash_neon, sz_hash_state_init_neon, // sz_hash_state_stream_neon, sz_hash_state_fold_neon); + test_random_generator_on_platform(sz_generate_serial, sz_generate_neon); #endif }; @@ -1800,9 +1832,7 @@ int main(int argc, char const **argv) { // Basic utilities test_arithmetical_utilities(); - - // Compatibility across hardware-specific implementations - test_hashing_across_platforms(); + test_simd_against_serial(); // Core APIs test_ascii_utilities(); From 2bbafa111426cab613053665a67cd41d44154920 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 27 Feb 2025 22:55:19 +0000 Subject: [PATCH 132/751] Make: Drop unused `build.sh` --- scripts/build.sh | 65 ------------------------------------------------ 1 file changed, 65 deletions(-) delete mode 100755 scripts/build.sh diff --git a/scripts/build.sh b/scripts/build.sh deleted file mode 100755 index 600e5758..00000000 --- a/scripts/build.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash -# This Bash script compiles the CMake-based project with different compilers for different verrsions of C++ -# This is what should happen if only GCC 12 is installed and we are running on Sapphire Rapids. 
-# -# cmake -DCMAKE_BUILD_TYPE=Release -DSTRINGZILLA_BUILD_BENCHMARK=1 \ -# -DCMAKE_CXX_COMPILER=g++-12 -DCMAKE_C_COMPILER=gcc-12 \ -# -DSTRINGZILLA_TARGET_ARCH="sandybridge" -B build_release/gcc-12-sandybridge && \ -# cmake --build build_release/gcc-12-sandybridge --config Release -# cmake -DCMAKE_BUILD_TYPE=Release -DSTRINGZILLA_BUILD_BENCHMARK=1 \ -# -DCMAKE_CXX_COMPILER=g++-12 -DCMAKE_C_COMPILER=gcc-12 \ -# -DSTRINGZILLA_TARGET_ARCH="haswell" -B build_release/gcc-12-haswell && \ -# cmake --build build_release/gcc-12-haswell --config Release -# cmake -DCMAKE_BUILD_TYPE=Release -DSTRINGZILLA_BUILD_BENCHMARK=1 \ -# -DCMAKE_CXX_COMPILER=g++-12 -DCMAKE_C_COMPILER=gcc-12 \ -# -DSTRINGZILLA_TARGET_ARCH="sapphirerapids" -B build_release/gcc-12-sapphirerapids && \ -# cmake --build build_release/gcc-12-sapphirerapids --config Release - -# Array of target architectures -declare -a architectures=("sandybridge" "haswell" "sapphirerapids") - -# Function to get installed versions of a compiler -get_versions() { - local compiler_prefix=$1 - local versions=() - - echo "Checking for compilers in /usr/bin with prefix: $compiler_prefix" - - # Check if the directory /usr/bin exists and is a directory - if [ -d "/usr/bin" ]; then - for version in /usr/bin/${compiler_prefix}-*; do - echo "Checking: $version" - if [[ -x "$version" ]]; then - local ver=${version##*-} - echo "Found compiler version: $ver" - versions+=("$ver") - fi - done - else - echo "/usr/bin does not exist or is not a directory" - fi - - echo ${versions[@]} -} - -# Get installed versions of GCC and Clang -gcc_versions=$(get_versions gcc) -clang_versions=$(get_versions clang) - -# Compile for each combination of compiler and architecture -for arch in "${ARCHS[@]}"; do - for gcc_version in $gcc_versions; do - cmake -DCMAKE_BUILD_TYPE=Release -DSTRINGZILLA_BUILD_BENCHMARK=1 \ - -DCMAKE_CXX_COMPILER=g++-$gcc_version -DCMAKE_C_COMPILER=gcc-$gcc_version \ - -DSTRINGZILLA_TARGET_ARCH="$arch" -B "build_release/gcc-$gcc_version-$arch" && \ - cmake --build "build_release/gcc-$gcc_version-$arch" --config Release - done - - for clang_version in $clang_versions; do - cmake -DCMAKE_BUILD_TYPE=Release -DSTRINGZILLA_BUILD_BENCHMARK=1 \ - -DCMAKE_CXX_COMPILER=clang++-$clang_version -DCMAKE_C_COMPILER=clang-$clang_version \ - -DSTRINGZILLA_TARGET_ARCH="$arch" -B "build_release/clang-$clang_version-$arch" && \ - cmake --build "build_release/clang-$clang_version-$arch" --config Release - done -done - From 3538e970e09d9b19dc52ed93bd1508cd0bb9d6d6 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 28 Feb 2025 14:43:27 +0000 Subject: [PATCH 133/751] Add: Fetching dynamic library version in C Added `sz_version_major`, `sz_version_minor`, and `sz_version_patch` APIs --- .github/workflows/prerelease.yml | 6 +++--- .github/workflows/release.yml | 6 +++--- c/lib.c | 4 ++++ include/stringzilla/stringzilla.h | 20 +++++++++++++++++--- 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml index 57514b79..82cace09 100644 --- a/.github/workflows/prerelease.yml +++ b/.github/workflows/prerelease.yml @@ -37,11 +37,11 @@ jobs: package.json:"version": "(\d+\.\d+\.\d+)" CMakeLists.txt:VERSION (\d+\.\d+\.\d+) update-major-version-in: | - include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_MAJOR (\d+) + include/stringzilla/stringzilla.h:^#define STRINGZILLA_H_VERSION_MAJOR (\d+) update-minor-version-in: | - 
include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_MINOR (\d+) + include/stringzilla/stringzilla.h:^#define STRINGZILLA_H_VERSION_MINOR (\d+) update-patch-version-in: | - include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_PATCH (\d+) + include/stringzilla/stringzilla.h:^#define STRINGZILLA_H_VERSION_PATCH (\d+) dry-run: "true" test_ubuntu_gcc: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 6a726b14..1c95500b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -37,11 +37,11 @@ jobs: package.json:"version": "(\d+\.\d+\.\d+)" CMakeLists.txt:VERSION (\d+\.\d+\.\d+) update-major-version-in: | - include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_MAJOR (\d+) + include/stringzilla/stringzilla.h:^#define STRINGZILLA_H_VERSION_MAJOR (\d+) update-minor-version-in: | - include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_MINOR (\d+) + include/stringzilla/stringzilla.h:^#define STRINGZILLA_H_VERSION_MINOR (\d+) update-patch-version-in: | - include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_PATCH (\d+) + include/stringzilla/stringzilla.h:^#define STRINGZILLA_H_VERSION_PATCH (\d+) dry-run: "false" push: "true" create-release: "true" diff --git a/c/lib.c b/c/lib.c index b65784cc..555f995a 100644 --- a/c/lib.c +++ b/c/lib.c @@ -391,6 +391,10 @@ BOOL WINAPI _DllMainCRTStartup(HINSTANCE hints, DWORD forward_reason, LPVOID lp) __attribute__((constructor)) static void sz_dispatch_table_init_on_gcc_or_clang(void) { sz_dispatch_table_init(); } #endif +SZ_DYNAMIC int sz_version_major(void) { return STRINGZILLA_H_VERSION_MAJOR; } +SZ_DYNAMIC int sz_version_minor(void) { return STRINGZILLA_H_VERSION_MINOR; } +SZ_DYNAMIC int sz_version_patch(void) { return STRINGZILLA_H_VERSION_PATCH; } + SZ_DYNAMIC sz_u64_t sz_bytesum(sz_cptr_t text, sz_size_t length) { return sz_dispatch_table.bytesum(text, length); } SZ_DYNAMIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length, sz_u64_t seed) { diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index 660ffa6c..284754bd 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -37,9 +37,9 @@ #ifndef STRINGZILLA_H_ #define STRINGZILLA_H_ -#define STRINGZILLA_VERSION_MAJOR 3 -#define STRINGZILLA_VERSION_MINOR 11 -#define STRINGZILLA_VERSION_PATCH 3 +#define STRINGZILLA_H_VERSION_MAJOR 3 +#define STRINGZILLA_H_VERSION_MINOR 11 +#define STRINGZILLA_H_VERSION_PATCH 3 #include "types.h" // `sz_size_t`, `sz_bool_t`, `sz_ordering_t` #include "compare.h" // `sz_equal`, `sz_order` @@ -79,6 +79,20 @@ typedef enum { */ SZ_DYNAMIC sz_capability_t sz_capabilities(void); +#if defined(SZ_DYNAMIC_DISPATCH) + +SZ_DYNAMIC int sz_version_major(void); +SZ_DYNAMIC int sz_version_minor(void); +SZ_DYNAMIC int sz_version_patch(void); + +#else + +SZ_PUBLIC int sz_version_major(void) { return STRINGZILLA_H_VERSION_MAJOR; } +SZ_PUBLIC int sz_version_minor(void) { return STRINGZILLA_H_VERSION_MINOR; } +SZ_PUBLIC int sz_version_patch(void) { return STRINGZILLA_H_VERSION_PATCH; } + +#endif + #ifdef __cplusplus } #endif // __cplusplus From 2ce2b49efd696992571074954942d0ad7ec85847 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 28 Feb 2025 14:47:49 +0000 Subject: [PATCH 134/751] Break: `charset`/`generate` -> `byteset`/`fill_random` --- c/lib.c | 163 ++++++++------------ include/stringzilla/find.h | 231 ++++++++++++++-------------- include/stringzilla/hash.h | 95 +++++++----- 
include/stringzilla/similarity.h | 10 +- include/stringzilla/stringzilla.h | 2 +- include/stringzilla/stringzilla.hpp | 178 ++++++++++----------- include/stringzilla/types.h | 75 +++++---- scripts/bench_search.cpp | 42 ++--- scripts/test.cpp | 78 +++++----- 9 files changed, 428 insertions(+), 446 deletions(-) diff --git a/c/lib.c b/c/lib.c index 555f995a..f742ad2b 100644 --- a/c/lib.c +++ b/c/lib.c @@ -94,14 +94,12 @@ SZ_INTERNAL sz_capability_t _sz_capabilities_arm(void) { // - 0b0010: SVE2.1 is implemented // This value must match the existing indicator obtained from ID_AA64PFR0_EL1: unsigned supports_sve2 = ((id_aa64zfr0_el1) & 0xF) >= 1; - unsigned supports_sve2p1 = ((id_aa64zfr0_el1) & 0xF) >= 2; unsigned supports_neon = 1; // NEON is always supported - return (sz_capability_t)( // - (sz_cap_neon_k * (supports_neon)) | // - (sz_cap_sve_k * (supports_sve)) | // - (sz_cap_sve2_k * (supports_sve2)) | // - (sz_cap_sve2p1_k * (supports_sve2p1)) | // + return (sz_capability_t)( // + (sz_cap_neon_k * (supports_neon)) | // + (sz_cap_sve_k * (supports_sve)) | // + (sz_cap_sve2_k * (supports_sve2)) | // (sz_cap_serial_k)); #else // if !defined(_SZ_IS_APPLE) && !defined(_SZ_IS_LINUX) @@ -183,7 +181,7 @@ typedef struct sz_implementations_t { sz_hash_state_init_t hash_state_init; sz_hash_state_stream_t hash_state_stream; sz_hash_state_fold_t hash_state_fold; - sz_generate_t generate; + sz_fill_random_t fill_random; sz_find_byte_t find_byte; sz_find_byte_t rfind_byte; @@ -196,9 +194,8 @@ typedef struct sz_implementations_t { sz_needleman_wunsch_score_t alignment_score; sz_sequence_argsort_t sequence_argsort; + sz_sequence_join_t sequence_join; sz_pgrams_sort_t pgrams_sort; - sz_sequence_argsort_stable_t sequence_argsort_stable; - sz_pgrams_sort_stable_t pgrams_sort_stable; } sz_implementations_t; @@ -229,22 +226,21 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->hash_state_init = sz_hash_state_init_serial; impl->hash_state_stream = sz_hash_state_stream_serial; impl->hash_state_fold = sz_hash_state_fold_serial; - impl->generate = sz_generate_serial; + impl->fill_random = sz_fill_random_serial; impl->find = sz_find_serial; impl->rfind = sz_rfind_serial; impl->find_byte = sz_find_byte_serial; impl->rfind_byte = sz_rfind_byte_serial; - impl->find_from_set = sz_find_charset_serial; - impl->rfind_from_set = sz_rfind_charset_serial; + impl->find_from_set = sz_find_byteset_serial; + impl->rfind_from_set = sz_rfind_byteset_serial; impl->edit_distance = sz_levenshtein_distance_serial; impl->alignment_score = sz_needleman_wunsch_score_serial; impl->sequence_argsort = sz_sequence_argsort_serial; + impl->sequence_join = sz_sequence_join_serial; impl->pgrams_sort = sz_pgrams_sort_serial; - impl->sequence_argsort_stable = sz_sequence_argsort_stable_serial; - impl->pgrams_sort_stable = sz_pgrams_sort_stable_serial; #if SZ_USE_HASWELL if (caps & sz_cap_haswell_k) { @@ -261,14 +257,14 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->hash_state_init = sz_hash_state_init_haswell; impl->hash_state_stream = sz_hash_state_stream_haswell; impl->hash_state_fold = sz_hash_state_fold_haswell; - impl->generate = sz_generate_haswell; + impl->fill_random = sz_fill_random_haswell; impl->find_byte = sz_find_byte_haswell; impl->rfind_byte = sz_rfind_byte_haswell; impl->find = sz_find_haswell; impl->rfind = sz_rfind_haswell; - impl->find_from_set = sz_find_charset_haswell; - impl->rfind_from_set = sz_rfind_charset_haswell; + impl->find_from_set = sz_find_byteset_haswell; + impl->rfind_from_set = 
sz_rfind_byteset_haswell; } #endif @@ -286,20 +282,24 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->hash_state_init = sz_hash_state_init_skylake; impl->hash_state_stream = sz_hash_state_stream_skylake; impl->hash_state_fold = sz_hash_state_fold_skylake; - impl->generate = sz_generate_skylake; + impl->fill_random = sz_fill_random_skylake; impl->find = sz_find_skylake; impl->rfind = sz_rfind_skylake; impl->find_byte = sz_find_byte_skylake; impl->rfind_byte = sz_rfind_byte_skylake; impl->bytesum = sz_bytesum_skylake; + + impl->sequence_argsort = sz_sequence_argsort_skylake; + impl->sequence_join = sz_sequence_join_skylake; + impl->pgrams_sort = sz_pgrams_sort_skylake; } #endif #if SZ_USE_ICE if (caps & sz_cap_ice_k) { - impl->find_from_set = sz_find_charset_ice; - impl->rfind_from_set = sz_rfind_charset_ice; + impl->find_from_set = sz_find_byteset_ice; + impl->rfind_from_set = sz_rfind_byteset_ice; impl->edit_distance = sz_levenshtein_distance_ice; impl->alignment_score = sz_needleman_wunsch_score_ice; @@ -311,12 +311,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->hash_state_init = sz_hash_state_init_ice; impl->hash_state_stream = sz_hash_state_stream_ice; impl->hash_state_fold = sz_hash_state_fold_ice; - impl->generate = sz_generate_ice; - - impl->sequence_argsort = sz_sequence_argsort_ice; - impl->pgrams_sort = sz_pgrams_sort_ice; - impl->sequence_argsort_stable = sz_sequence_argsort_stable_ice; - impl->pgrams_sort_stable = sz_pgrams_sort_stable_ice; + impl->fill_random = sz_fill_random_ice; } #endif @@ -334,23 +329,22 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->hash_state_init = sz_hash_state_init_neon; impl->hash_state_stream = sz_hash_state_stream_neon; impl->hash_state_fold = sz_hash_state_fold_neon; - impl->generate = sz_generate_neon; + impl->fill_random = sz_fill_random_neon; impl->find = sz_find_neon; impl->rfind = sz_rfind_neon; impl->find_byte = sz_find_byte_neon; impl->rfind_byte = sz_rfind_byte_neon; - impl->find_from_set = sz_find_charset_neon; - impl->rfind_from_set = sz_rfind_charset_neon; + impl->find_from_set = sz_find_byteset_neon; + impl->rfind_from_set = sz_rfind_byteset_neon; } #endif #if SZ_USE_SVE if (caps & sz_cap_sve_k) { impl->sequence_argsort = sz_sequence_argsort_sve; + impl->sequence_join = sz_sequence_join_sve; impl->pgrams_sort = sz_pgrams_sort_sve; - impl->sequence_argsort_stable = sz_sequence_argsort_stable_sve; - impl->pgrams_sort_stable = sz_pgrams_sort_stable_sve; } #endif } @@ -413,8 +407,8 @@ SZ_DYNAMIC sz_u64_t sz_hash_state_fold(sz_hash_state_t const *state) { return sz_dispatch_table.hash_state_fold(state); } -SZ_DYNAMIC void sz_generate(sz_ptr_t result, sz_size_t result_length, sz_u64_t nonce) { - sz_dispatch_table.generate(result, result_length, nonce); +SZ_DYNAMIC void sz_fill_random(sz_ptr_t result, sz_size_t result_length, sz_u64_t nonce) { + sz_dispatch_table.fill_random(result, result_length, nonce); } SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { @@ -457,51 +451,47 @@ SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t return sz_dispatch_table.rfind(haystack, h_length, needle, n_length); } -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { +SZ_DYNAMIC sz_cptr_t sz_find_byteset(sz_cptr_t text, sz_size_t length, sz_byteset_t const *set) { return sz_dispatch_table.find_from_set(text, length, set); } -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { +SZ_DYNAMIC 
sz_cptr_t sz_rfind_byteset(sz_cptr_t text, sz_size_t length, sz_byteset_t const *set) { return sz_dispatch_table.rfind_from_set(text, length, set); } -SZ_DYNAMIC sz_size_t sz_hamming_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_serial(a, a_length, b, b_length, bound); +SZ_DYNAMIC sz_status_t sz_hamming_distance( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound, sz_size_t *result) { + return sz_hamming_distance_serial(a, a_length, b, b_length, bound, result); } -SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound) { - return sz_hamming_distance_utf8_serial(a, a_length, b, b_length, bound); -} - -SZ_DYNAMIC sz_size_t sz_levenshtein_distance( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - return sz_dispatch_table.edit_distance(a, a_length, b, b_length, bound, alloc); +SZ_DYNAMIC sz_status_t sz_hamming_distance_utf8( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound, sz_size_t *result) { + return sz_hamming_distance_utf8_serial(a, a_length, b, b_length, bound, result); } -SZ_DYNAMIC sz_size_t sz_levenshtein_distance_utf8( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_size_t bound, sz_memory_allocator_t *alloc) { - return _sz_levenshtein_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc); +SZ_DYNAMIC sz_status_t sz_levenshtein_distance( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound, sz_memory_allocator_t *alloc, sz_size_t *result) { + return sz_dispatch_table.edit_distance(a, a_length, b, b_length, bound, alloc, result); } -SZ_DYNAMIC sz_ssize_t sz_needleman_wunsch_score( // - sz_cptr_t a, sz_size_t a_length, // - sz_cptr_t b, sz_size_t b_length, // - sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) { - return sz_dispatch_table.alignment_score(a, a_length, b, b_length, subs, gap, alloc); +SZ_DYNAMIC sz_status_t sz_levenshtein_distance_utf8( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // + sz_size_t bound, sz_memory_allocator_t *alloc, sz_size_t *result) { + return _sz_levenshtein_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc, result); } -SZ_DYNAMIC sz_status_t sz_sequence_argsort(sz_sequence_t const *array, sz_memory_allocator_t *alloc, sz_size_t *order) { - return sz_dispatch_table.sequence_argsort(array, alloc, order); +SZ_DYNAMIC sz_status_t sz_needleman_wunsch_score( // + sz_cptr_t a, sz_size_t a_length, // + sz_cptr_t b, sz_size_t b_length, // + sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc, sz_ssize_t *result) { + return sz_dispatch_table.alignment_score(a, a_length, b, b_length, subs, gap, alloc, result); } SZ_DYNAMIC sz_status_t sz_pgrams_sort(sz_pgram_t *array, sz_size_t count, sz_memory_allocator_t *alloc, @@ -509,44 +499,15 @@ SZ_DYNAMIC sz_status_t sz_pgrams_sort(sz_pgram_t *array, sz_size_t count, sz_mem return sz_dispatch_table.pgrams_sort(array, count, alloc, order); } -SZ_DYNAMIC sz_status_t sz_sequence_argsort_stable(sz_sequence_t const *array, sz_memory_allocator_t *alloc, - sz_size_t *order) { - return sz_dispatch_table.sequence_argsort_stable(array, alloc, order); -} - 
-SZ_DYNAMIC sz_status_t sz_pgrams_sort_stable(sz_pgram_t *array, sz_size_t count, sz_memory_allocator_t *alloc, - sz_size_t *order) { - return sz_dispatch_table.pgrams_sort_stable(array, count, alloc, order); -} - -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_rfind_charset(h, h_length, &set); +SZ_DYNAMIC sz_status_t sz_sequence_argsort(sz_sequence_t const *array, sz_memory_allocator_t *alloc, sz_size_t *order) { + return sz_dispatch_table.sequence_argsort(array, alloc, order); } -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_rfind_charset(h, h_length, &set); +SZ_DYNAMIC sz_status_t sz_sequence_join(sz_sequence_t const *first_array, sz_sequence_t const *second_array, + sz_memory_allocator_t *alloc, sz_size_t *intersection_size, + sz_size_t *first_positions, sz_size_t *second_positions) { + return sz_dispatch_table.sequence_join(first_array, second_array, alloc, intersection_size, first_positions, + second_positions); } // Provide overrides for the libc mem* functions @@ -626,7 +587,7 @@ SZ_DYNAMIC void *memrchr(void const *s, int c_wide, size_t n) { SZ_DYNAMIC void memfrob(void *s, size_t n) { static sz_u64_t nonce = 42; - sz_generate(s, n, nonce++); + sz_fill_random(s, n, nonce++); } #endif diff --git a/include/stringzilla/find.h b/include/stringzilla/find.h index 90b6a16f..d3db653e 100644 --- a/include/stringzilla/find.h +++ b/include/stringzilla/find.h @@ -7,14 +7,14 @@ * * - `sz_find` and reverse-order `sz_rfind` * - `sz_find_byte` and reverse-order `sz_rfind_byte` - * - `sz_find_charset` and reverse-order `sz_rfind_charset` + * - `sz_find_byteset` and reverse-order `sz_rfind_byteset` * * Convenience functions for character-set matching: * - * - `sz_find_char_from` - * - `sz_find_char_not_from` - * - `sz_rfind_char_from` - * - `sz_rfind_char_not_from` + * - `sz_find_byte_from` shortcut for `sz_find_byteset` + * - `sz_find_byte_not_from` shortcut for `sz_find_byteset` with inverted set + * - `sz_rfind_byte_from` shortcut for `sz_rfind_byteset` + * - `sz_rfind_byte_not_from` shortcut for `sz_rfind_byteset` with inverted set */ #ifndef STRINGZILLA_FIND_H_ #define STRINGZILLA_FIND_H_ @@ -35,10 +35,10 @@ extern "C" { * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memchr.S * Aarch64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/aarch64/memchr.S * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the first match. 
+ * @param[in] haystack Haystack - the string to search in. + * @param[in] h_length Number of bytes in the haystack. + * @param[in] needle Needle - single-byte substring to find. + * @return Address of the first match. */ SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); @@ -48,10 +48,10 @@ SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cpt * X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memrchr.S * Aarch64 implementation: missing * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - single-byte substring to find. - * @return Address of the last match. + * @param[in] haystack Haystack - the string to search in. + * @param[in] h_length Number of bytes in the haystack. + * @param[in] needle Needle - single-byte substring to find. + * @return Address of the last match. */ SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle); @@ -86,22 +86,22 @@ SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t haystack, sz_size_t h_length, s * Equivalent to `memmem(haystack, h_length, needle, n_length)` in LibC. * Similar to `strstr(haystack, needle)` in LibC, but requires known length. * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the first match. + * @param[in] haystack Haystack - the string to search in. + * @param[in] h_length Number of bytes in the haystack. + * @param[in] needle Needle - substring to find. + * @param[in] n_length Number of bytes in the needle. + * @return Address of the first match. */ SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); /** * @brief Locates the last matching substring. * - * @param haystack Haystack - the string to search in. - * @param h_length Number of bytes in the haystack. - * @param needle Needle - substring to find. - * @param n_length Number of bytes in the needle. - * @return Address of the last match. + * @param[in] haystack Haystack - the string to search in. + * @param[in] h_length Number of bytes in the haystack. + * @param[in] needle Needle - substring to find. + * @param[in] n_length Number of bytes in the needle. + * @return Address of the last match. */ SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length); @@ -132,9 +132,9 @@ SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cpt #endif /** - * @brief Finds the first character present from the ::set, present in ::text. + * @brief Finds the first character present from the @p set, present in @p text. * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_rfind_charset. + * May have identical implementation and performance to ::sz_rfind_byteset. * * Useful for parsing, when we want to skip a set of characters. Examples: * - 6 whitespaces: " \t\n\r\v\f". @@ -142,16 +142,16 @@ SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cpt * - 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. * - 2 JSON string special characters useful to locate the end of the string: "\"\\". * - * @param text String to be scanned. 
- * @param set Set of relevant characters. - * @return Pointer to the first matching character from ::set. + * @param[in] text String to be scanned. + * @param[in] set Set of relevant characters. + * @return Pointer to the first matching character from @p set. */ -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); +SZ_DYNAMIC sz_cptr_t sz_find_byteset(sz_cptr_t text, sz_size_t length, sz_byteset_t const *set); /** - * @brief Finds the last character present from the ::set, present in ::text. + * @brief Finds the last character present from the @p set, present in @p text. * Equivalent to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC. - * May have identical implementation and performance to ::sz_find_charset. + * May have identical implementation and performance to ::sz_find_byteset. * * Useful for parsing, when we want to skip a set of characters. Examples: * - 6 whitespaces: " \t\n\r\v\f". @@ -159,40 +159,74 @@ SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charse * - 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing. * - 2 JSON string special characters useful to locate the end of the string: "\"\\". * - * @param text String to be scanned. - * @param set Set of relevant characters. - * @return Pointer to the last matching character from ::set. + * @param[in] text String to be scanned. + * @param[in] set Set of relevant characters. + * @return Pointer to the last matching character from @p set. */ -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); +SZ_DYNAMIC sz_cptr_t sz_rfind_byteset(sz_cptr_t text, sz_size_t length, sz_byteset_t const *set); -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set); +/** @copydoc sz_find_byteset */ +SZ_PUBLIC sz_cptr_t sz_find_byteset_serial(sz_cptr_t text, sz_size_t length, sz_byteset_t const *set); +/** @copydoc sz_rfind_byteset */ +SZ_PUBLIC sz_cptr_t sz_rfind_byteset_serial(sz_cptr_t text, sz_size_t length, sz_byteset_t const *set); #if SZ_USE_HASWELL -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_haswell(sz_cptr_t haystack, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_haswell(sz_cptr_t haystack, sz_size_t length, sz_charset_t const *set); +/** @copydoc sz_find_byteset */ +SZ_PUBLIC sz_cptr_t sz_find_byteset_haswell(sz_cptr_t haystack, sz_size_t length, sz_byteset_t const *set); +/** @copydoc sz_rfind_byteset */ +SZ_PUBLIC sz_cptr_t sz_rfind_byteset_haswell(sz_cptr_t haystack, sz_size_t length, sz_byteset_t const *set); #endif #if SZ_USE_ICE -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t haystack, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_ice(sz_cptr_t haystack, sz_size_t length, sz_charset_t const *set); +/** @copydoc sz_find_byteset */ +SZ_PUBLIC sz_cptr_t sz_find_byteset_ice(sz_cptr_t haystack, sz_size_t length, sz_byteset_t const *set); +/** @copydoc sz_rfind_byteset */ +SZ_PUBLIC sz_cptr_t sz_rfind_byteset_ice(sz_cptr_t haystack, sz_size_t length, sz_byteset_t const *set); #endif #if SZ_USE_NEON -/** @copydoc sz_find_charset */ -SZ_PUBLIC sz_cptr_t 
sz_find_charset_neon(sz_cptr_t haystack, sz_size_t length, sz_charset_t const *set); -/** @copydoc sz_rfind_charset */ -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t haystack, sz_size_t length, sz_charset_t const *set); +/** @copydoc sz_find_byteset */ +SZ_PUBLIC sz_cptr_t sz_find_byteset_neon(sz_cptr_t haystack, sz_size_t length, sz_byteset_t const *set); +/** @copydoc sz_rfind_byteset */ +SZ_PUBLIC sz_cptr_t sz_rfind_byteset_neon(sz_cptr_t haystack, sz_size_t length, sz_byteset_t const *set); #endif #pragma endregion // Core API +#pragma region Helper Shortcuts + +SZ_PUBLIC sz_cptr_t sz_find_byte_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { + sz_byteset_t set; + sz_byteset_init(&set); + for (; n_length; ++n, --n_length) sz_byteset_add(&set, *n); + return sz_find_byteset(h, h_length, &set); +} + +SZ_PUBLIC sz_cptr_t sz_find_byte_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { + sz_byteset_t set; + sz_byteset_init(&set); + for (; n_length; ++n, --n_length) sz_byteset_add(&set, *n); + sz_byteset_invert(&set); + return sz_find_byteset(h, h_length, &set); +} + +SZ_PUBLIC sz_cptr_t sz_rfind_byte_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { + sz_byteset_t set; + sz_byteset_init(&set); + for (; n_length; ++n, --n_length) sz_byteset_add(&set, *n); + return sz_rfind_byteset(h, h_length, &set); +} + +SZ_PUBLIC sz_cptr_t sz_rfind_byte_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { + sz_byteset_t set; + sz_byteset_init(&set); + for (; n_length; ++n, --n_length) sz_byteset_add(&set, *n); + sz_byteset_invert(&set); + return sz_rfind_byteset(h, h_length, &set); +} + +#pragma endregion // Helper Shortcuts + #pragma region Serial Implementation /** @@ -270,18 +304,18 @@ SZ_INTERNAL void _sz_locate_needle_anomalies( // } } -SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { +SZ_PUBLIC sz_cptr_t sz_find_byteset_serial(sz_cptr_t text, sz_size_t length, sz_byteset_t const *set) { for (sz_cptr_t const end = text + length; text != end; ++text) - if (sz_charset_contains(set, *text)) return text; + if (sz_byteset_contains(set, *text)) return text; return SZ_NULL_CHAR; } -SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { +SZ_PUBLIC sz_cptr_t sz_rfind_byteset_serial(sz_cptr_t text, sz_size_t length, sz_byteset_t const *set) { #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Warray-bounds" sz_cptr_t const end = text; for (text += length; text != end;) - if (sz_charset_contains(set, *(text -= 1))) return text; + if (sz_byteset_contains(set, *(text -= 1))) return text; return SZ_NULL_CHAR; #pragma GCC diagnostic pop } @@ -893,7 +927,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t return sz_rfind_serial(h, h_length, n, n_length); } -SZ_PUBLIC sz_cptr_t sz_find_charset_haswell(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { +SZ_PUBLIC sz_cptr_t sz_find_byteset_haswell(sz_cptr_t text, sz_size_t length, sz_byteset_t const *filter) { // Let's unzip even and odd elements and replicate them into both lanes of the YMM register. // That way when we invoke `_mm256_shuffle_epi8` we can use the same mask for both lanes. 
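
To make the renamed byteset API concrete, here is a small illustrative sketch (not part of this patch) that builds the six-whitespace set mentioned in the comments above and scans a buffer with it. It uses only functions visible in this diff (sz_byteset_init, sz_byteset_add, sz_find_byteset); the helper name is hypothetical.

#include <stringzilla/stringzilla.h>

/* Returns a pointer to the first ASCII whitespace byte, or SZ_NULL_CHAR if none is found. */
static char const *find_first_whitespace(char const *text, sz_size_t length) {
    char const whitespaces[] = " \t\n\r\v\f"; /* the six whitespaces listed in the docs above */
    sz_byteset_t set;
    sz_byteset_init(&set);
    for (sz_size_t i = 0; i != 6; ++i) sz_byteset_add(&set, whitespaces[i]);
    return sz_find_byteset(text, length, &set);
}

The sz_find_byte_from shortcut introduced in the Helper Shortcuts region performs the same set construction internally, so the call above could also be written as sz_find_byte_from(text, length, " \t\n\r\v\f", 6).
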
@@ -978,11 +1012,11 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_haswell(sz_cptr_t text, sz_size_t length, sz else { text += 32, length -= 32; } } - return sz_find_charset_serial(text, length, filter); + return sz_find_byteset_serial(text, length, filter); } -SZ_PUBLIC sz_cptr_t sz_rfind_charset_haswell(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); +SZ_PUBLIC sz_cptr_t sz_rfind_byteset_haswell(sz_cptr_t text, sz_size_t length, sz_byteset_t const *filter) { + return sz_rfind_byteset_serial(text, length, filter); } #pragma clang attribute pop @@ -1233,13 +1267,13 @@ SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t __attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vbmi2,bmi,bmi2"))), \ apply_to = function) -SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { +SZ_PUBLIC sz_cptr_t sz_find_byteset_ice(sz_cptr_t text, sz_size_t length, sz_byteset_t const *filter) { // Before initializing the AVX-512 vectors, we may want to run the sequential code for the first few bytes. // In practice, that only hurts, even when we have matches every 5-ish bytes. // - // if (length < SZ_SWAR_THRESHOLD) return sz_find_charset_serial(text, length, filter); - // sz_cptr_t early_result = sz_find_charset_serial(text, SZ_SWAR_THRESHOLD, filter); + // if (length < SZ_SWAR_THRESHOLD) return sz_find_byteset_serial(text, length, filter); + // sz_cptr_t early_result = sz_find_byteset_serial(text, SZ_SWAR_THRESHOLD, filter); // if (early_result) return early_result; // text += SZ_SWAR_THRESHOLD; // length -= SZ_SWAR_THRESHOLD; @@ -1348,8 +1382,8 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_ice(sz_cptr_t text, sz_size_t length, sz_cha return SZ_NULL_CHAR; } -SZ_PUBLIC sz_cptr_t sz_rfind_charset_ice(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) { - return sz_rfind_charset_serial(text, length, filter); +SZ_PUBLIC sz_cptr_t sz_rfind_byteset_ice(sz_cptr_t text, sz_size_t length, sz_byteset_t const *filter) { + return sz_rfind_byteset_serial(text, length, filter); } #pragma clang attribute pop @@ -1408,7 +1442,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_ return sz_rfind_byte_serial(h, h_length, n); } -SZ_PUBLIC sz_u64_t _sz_find_charset_neon_register( // +SZ_PUBLIC sz_u64_t _sz_find_byteset_neon_register( // sz_u128_vec_t h_vec, uint8x16_t set_top_vec_u8x16, uint8x16_t set_bottom_vec_u8x16) { // Once we've read the characters in the haystack, we want to @@ -1550,7 +1584,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, return sz_rfind_serial(h, h_length, n, n_length); } -SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { +SZ_PUBLIC sz_cptr_t sz_find_byteset_neon(sz_cptr_t h, sz_size_t h_length, sz_byteset_t const *set) { sz_u64_t matches; sz_u128_vec_t h_vec; uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); @@ -1558,27 +1592,27 @@ SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_cha for (; h_length >= 16; h += 16, h_length -= 16) { h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h)); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); + matches = _sz_find_byteset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); if (matches) return h + sz_u64_ctz(matches) / 4; } - return sz_find_charset_serial(h, h_length, set); + return 
sz_find_byteset_serial(h, h_length, set); } -SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) { +SZ_PUBLIC sz_cptr_t sz_rfind_byteset_neon(sz_cptr_t h, sz_size_t h_length, sz_byteset_t const *set) { sz_u64_t matches; sz_u128_vec_t h_vec; uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]); uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]); - // Check `sz_find_charset_neon` for explanations. + // Check `sz_find_byteset_neon` for explanations. for (; h_length >= 16; h_length -= 16) { h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h) + h_length - 16); - matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); + matches = _sz_find_byteset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16); if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4; } - return sz_rfind_charset_serial(h, h_length, set); + return sz_rfind_byteset_serial(h, h_length, set); } #pragma clang attribute pop @@ -1656,64 +1690,31 @@ SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t #endif } -SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { +SZ_DYNAMIC sz_cptr_t sz_find_byteset(sz_cptr_t text, sz_size_t length, sz_byteset_t const *set) { #if SZ_USE_ICE - return sz_find_charset_ice(text, length, set); + return sz_find_byteset_ice(text, length, set); #elif SZ_USE_HASWELL - return sz_find_charset_haswell(text, length, set); + return sz_find_byteset_haswell(text, length, set); #elif SZ_USE_NEON - return sz_find_charset_neon(text, length, set); + return sz_find_byteset_neon(text, length, set); #else - return sz_find_charset_serial(text, length, set); + return sz_find_byteset_serial(text, length, set); #endif } -SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) { +SZ_DYNAMIC sz_cptr_t sz_rfind_byteset(sz_cptr_t text, sz_size_t length, sz_byteset_t const *set) { #if SZ_USE_ICE - return sz_rfind_charset_ice(text, length, set); + return sz_rfind_byteset_ice(text, length, set); #elif SZ_USE_HASWELL - return sz_rfind_charset_haswell(text, length, set); + return sz_rfind_byteset_haswell(text, length, set); #elif SZ_USE_NEON - return sz_rfind_charset_neon(text, length, set); + return sz_rfind_byteset_neon(text, length, set); #else - return sz_rfind_charset_serial(text, length, set); + return sz_rfind_byteset_serial(text, length, set); #endif } #pragma endregion -#pragma region Helper Shortcuts - -SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_find_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) sz_charset_add(&set, *n); - return sz_rfind_charset(h, h_length, &set); -} - -SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - sz_charset_t set; - sz_charset_init(&set); - for (; n_length; ++n, --n_length) 
sz_charset_add(&set, *n); - sz_charset_invert(&set); - return sz_rfind_charset(h, h_length, &set); -} - -#pragma endregion // Helper Shortcuts #endif // !SZ_DYNAMIC_DISPATCH #pragma endregion // Compile Time Dispatching diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index 6ef11e3d..e23b700a 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -3,12 +3,12 @@ * @file hash.h * @author Ash Vardanian * - * Includes core APIs: + * Includes core APIs with hardware-specific backends: * * - `sz_bytesum` - for byte-level 64-bit unsigned byte-level checksums. * - `sz_hash` - for 64-bit single-shot hashing using AES instructions. * - `sz_hash_state_init`, `sz_hash_state_stream`, `sz_hash_state_fold` - for incremental hashing. - * - `sz_generate` - for populating buffers with pseudo-random noise using AES instructions. + * - `sz_fill_random` - for populating buffers with pseudo-random noise using AES instructions. * * Why the hell do we need a yet another hashing library?! * Turns out, most existing libraries have noticeable constraints. Try finding a library that: @@ -31,12 +31,12 @@ * - "xxHash" is implemented in C, has an extremely wide set of third-party language bindings, and provides both * 32-, 64-, and 128-bit hashes. It is fast, but its dynamic dispatch is limited to x86 with `xxh_x86dispatch.c`. * - * StringZilla uses a scheme more similar to the "aHash" library, utilizing the AES extensions, that provide + * StringZilla uses a scheme more similar to "aHash" and "GxHash", utilizing the AES extensions, that provide * a remarkable level of "mixing per cycle" and are broadly available on modern CPUs. Similar to "aHash", they * are combined with "shuffle & add" instructions to provide a high level of entropy in the output. That operation * is practically free, as many modern CPUs will dispatch them on different ports. On x86, for example: * - * - `VAESENC` (ZMM, ZMM, ZMM)`: + * - `VAESENC (ZMM, ZMM, ZMM)` and `VAESDEC (ZMM, ZMM, ZMM)`: * - on Intel Ice Lake: 5 cycles on port 0. * - On AMD Zen4: 4 cycles on ports 0 or 1. * - `VPSHUFB_Z (ZMM, K, ZMM, ZMM)` @@ -46,12 +46,16 @@ * - on Intel Ice Lake: 1 cycle on ports 0 or 5. * - On AMD Zen4: 1 cycle on ports 0, 1, 2, 3. * - * Unlike "aHash", the length is not mixed into "AES" block at start to allow incremental construction. - * Unlike "aHash", on long inputs, we use a heavier procedure that is more vector-friendly on modern servers. - * Unlike "aHash", we don't load interleaved memory regions, making vectorized variant more similar to sequential. - * Unlike "aHash", on platforms like Intel Skylake-X or AWS Graviton 3, we use masked loads. - * Unlike "aHash", in final folding procedure, we use the same `VAESENC` instead of `VAESDEC`, which - * still provides the same level of mixing, but allows us to have a lighter serial fallback implementation. + * But there several key differences: + * + * - A larger state and a larger block size is used for inputs over 64 bytes longs, benefiting from wider registers + * on current CPUs. Like many other hash functions, the state is initialized with the seed and a set of Pi constants. + * Unlike others, we pull more Pi bits (1024), but only 64-bits of the seed, to keep the API sane. + * - The length of the input is not mixed into the AES block at the start to allow incremental construction, + * when the final length is not known in advance. + * - The vector-loads are not interleaved, meaning that each byte of input has exactly the same weight in the hash. 
+ * On the implementation side it require some extra shuffling on older platforms, but on newer platforms it + * can be done with "masked" loads in AVX-512 and "predicated" instructions in SVE2. * * @see Reini Urban's more active fork of SMHasher by Austin Appleby: https://github.com/rurban/smhasher * @see The serial AES routines are based on Morten Jensen's "tiny-AES-c": https://github.com/kokke/tiny-AES-c @@ -59,6 +63,16 @@ * @see The "aHash" Rust implementation by Tom Kaitchuck: https://github.com/tkaitchuck/aHash * @see "Emulating x86 AES Intrinsics on ARMv8-A" by Michael Brase: * https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a/ + * + * Moreover, the same AES primitives are reused to implement a fast Pseudo-Random Number Generator @b (PRNG) that + * is consistent between different implementation backends and has reproducible output with the same "nonce". + * Originally, the PRNG was designed to produce random byte sequences, but combining it with @b `sz_lookup`, + * one can produce random strings with a given byteset. + * + * Other helpers include: TODO: + * + * - `sz_fill_alphabet` - combines `sz_fill_random` & `sz_lookup` to fill buffers with random ASCII characters. + * - `sz_fill_alphabet_utf8` - combines `sz_fill_random` & `sz_lookup` to fill buffers with random UTF-8 characters. */ #ifndef STRINGZILLA_HASH_H_ #define STRINGZILLA_HASH_H_ @@ -114,7 +128,7 @@ SZ_DYNAMIC sz_u64_t sz_bytesum(sz_cptr_t text, sz_size_t length); * @endcode * * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. - * @sa sz_hash_serial, sz_hash_haswell, sz_hash_skylake, sz_hash_ice, sz_hash_neon + * @sa sz_hash_serial, sz_hash_haswell, sz_hash_skylake, sz_hash_ice, sz_hash_neon, sz_hash_sve * * @note The algorithm must provide the same output on all platforms in both single-shot and incremental modes. * @sa sz_hash_state_init, sz_hash_state_stream, sz_hash_state_fold @@ -144,16 +158,17 @@ SZ_DYNAMIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length, sz_u64_t seed); * #include * int main() { * char first_buffer[5], second_buffer[5]; - * sz_generate(first_buffer, 5, 0); - * sz_generate(second_buffer, 5, 0); //? Same nonce must produce the same output + * sz_fill_random(first_buffer, 5, 0); + * sz_fill_random(second_buffer, 5, 0); //? Same nonce must produce the same output * return sz_bytesum(first_buffer, 5) == sz_bytesum(second_buffer, 5) ? 0 : 1; * } * @endcode * * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. - * @sa sz_generate_serial, sz_generate_haswell, sz_generate_skylake, sz_generate_ice, sz_generate_neon + * @sa sz_fill_random_serial, sz_fill_random_haswell, sz_fill_random_skylake, sz_fill_random_ice, + * sz_fill_random_neon, sz_fill_random_sve */ -SZ_DYNAMIC void sz_generate(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); +SZ_DYNAMIC void sz_fill_random(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); /** * @brief The state for incremental construction of a hash. 
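
As a rough usage sketch of the incremental interface referenced above, the snippet below feeds the same bytes through the streaming state in two chunks and compares the folded result against the single-shot hash. The (state, data, length) signature of sz_hash_state_stream is an assumption here, since only its dispatch wiring appears in this diff.

#include <assert.h>
#include <string.h>
#include <stringzilla/stringzilla.h>

int main(void) {
    char const *text = "streaming must match single-shot hashing";
    sz_size_t length = (sz_size_t)strlen(text);
    sz_u64_t seed = 42;

    /* Hash the whole buffer at once. */
    sz_u64_t single_shot = sz_hash(text, length, seed);

    /* Feed the same bytes in two arbitrary chunks and fold the state. */
    sz_hash_state_t state;
    sz_hash_state_init(&state, seed);
    sz_hash_state_stream(&state, text, 10);
    sz_hash_state_stream(&state, text + 10, length - 10);
    assert(sz_hash_state_fold(&state) == single_shot);
    return 0;
}
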
@@ -204,8 +219,8 @@ SZ_PUBLIC sz_u64_t sz_bytesum_serial(sz_cptr_t text, sz_size_t length); /** @copydoc sz_hash */ SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length, sz_u64_t seed); -/** @copydoc sz_generate */ -SZ_PUBLIC void sz_generate_serial(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); +/** @copydoc sz_fill_random */ +SZ_PUBLIC void sz_fill_random_serial(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); /** @copydoc sz_hash_state_init */ SZ_PUBLIC void sz_hash_state_init_serial(sz_hash_state_t *state, sz_u64_t seed); @@ -222,8 +237,8 @@ SZ_PUBLIC sz_u64_t sz_bytesum_haswell(sz_cptr_t text, sz_size_t length); /** @copydoc sz_hash */ SZ_PUBLIC sz_u64_t sz_hash_haswell(sz_cptr_t text, sz_size_t length, sz_u64_t seed); -/** @copydoc sz_generate */ -SZ_PUBLIC void sz_generate_haswell(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); +/** @copydoc sz_fill_random */ +SZ_PUBLIC void sz_fill_random_haswell(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); /** @copydoc sz_hash_state_init */ SZ_PUBLIC void sz_hash_state_init_haswell(sz_hash_state_t *state, sz_u64_t seed); @@ -240,8 +255,8 @@ SZ_PUBLIC sz_u64_t sz_bytesum_skylake(sz_cptr_t text, sz_size_t length); /** @copydoc sz_hash */ SZ_PUBLIC sz_u64_t sz_hash_skylake(sz_cptr_t text, sz_size_t length, sz_u64_t seed); -/** @copydoc sz_generate */ -SZ_PUBLIC void sz_generate_skylake(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); +/** @copydoc sz_fill_random */ +SZ_PUBLIC void sz_fill_random_skylake(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); /** @copydoc sz_hash_state_init */ SZ_PUBLIC void sz_hash_state_init_skylake(sz_hash_state_t *state, sz_u64_t seed); @@ -258,8 +273,8 @@ SZ_PUBLIC sz_u64_t sz_bytesum_ice(sz_cptr_t text, sz_size_t length); /** @copydoc sz_hash */ SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t text, sz_size_t length, sz_u64_t seed); -/** @copydoc sz_generate */ -SZ_PUBLIC void sz_generate_ice(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); +/** @copydoc sz_fill_random */ +SZ_PUBLIC void sz_fill_random_ice(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); /** @copydoc sz_hash_state_init */ SZ_PUBLIC void sz_hash_state_init_ice(sz_hash_state_t *state, sz_u64_t seed); @@ -276,8 +291,8 @@ SZ_PUBLIC sz_u64_t sz_bytesum_neon(sz_cptr_t text, sz_size_t length); /** @copydoc sz_hash */ SZ_PUBLIC sz_u64_t sz_hash_neon(sz_cptr_t text, sz_size_t length, sz_u64_t seed); -/** @copydoc sz_generate */ -SZ_PUBLIC void sz_generate_neon(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); +/** @copydoc sz_fill_random */ +SZ_PUBLIC void sz_fill_random_neon(sz_ptr_t text, sz_size_t length, sz_u64_t nonce); /** @copydoc sz_hash_state_init */ SZ_PUBLIC void sz_hash_state_init_neon(sz_hash_state_t *state, sz_u64_t seed); @@ -704,7 +719,7 @@ SZ_PUBLIC sz_u64_t sz_hash_state_fold_serial(sz_hash_state_t const *state) { } } -SZ_PUBLIC void sz_generate_serial(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { +SZ_PUBLIC void sz_fill_random_serial(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { sz_u64_t const *pi_ptr = _sz_hash_pi_constants(); sz_u128_vec_t input_vec, pi_vec, key_vec, generated_vec; for (sz_size_t lane_index = 0; length; ++lane_index) { @@ -728,8 +743,8 @@ SZ_PUBLIC void sz_generate_serial(sz_ptr_t text, sz_size_t length, sz_u64_t nonc #pragma region Haswell Implementation #if SZ_USE_HASWELL #pragma GCC push_options -#pragma GCC target("avx2") -#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) +#pragma GCC target("avx2", "aes") +#pragma clang attribute 
push(__attribute__((target("avx2,aes"))), apply_to = function) SZ_PUBLIC sz_u64_t sz_bytesum_haswell(sz_cptr_t text, sz_size_t length) { // The naive implementation of this function is very simple. @@ -1058,7 +1073,7 @@ SZ_PUBLIC sz_u64_t sz_hash_state_fold_haswell(sz_hash_state_t const *state) { } } -SZ_PUBLIC void sz_generate_haswell(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { +SZ_PUBLIC void sz_fill_random_haswell(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { sz_u64_t const *pi_ptr = _sz_hash_pi_constants(); if (length <= 16) { __m128i input = _mm_set1_epi64x(nonce); @@ -1165,8 +1180,8 @@ SZ_PUBLIC void sz_generate_haswell(sz_ptr_t text, sz_size_t length, sz_u64_t non #pragma region Skylake Implementation #if SZ_USE_SKYLAKE #pragma GCC push_options -#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2") -#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function) +#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2", "aes") +#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2,aes"))), apply_to = function) SZ_PUBLIC sz_u64_t sz_bytesum_skylake(sz_cptr_t text, sz_size_t length) { // The naive implementation of this function is very simple. @@ -1386,8 +1401,8 @@ SZ_PUBLIC sz_u64_t sz_hash_state_fold_skylake(sz_hash_state_t const *state) { return sz_hash_state_fold_haswell(state); } -SZ_PUBLIC void sz_generate_skylake(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { - sz_generate_serial(text, length, nonce); +SZ_PUBLIC void sz_fill_random_skylake(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { + sz_fill_random_serial(text, length, nonce); } #pragma clang attribute pop @@ -1647,7 +1662,7 @@ SZ_PUBLIC sz_u64_t sz_hash_state_fold_ice(sz_hash_state_t const *state) { return sz_hash_state_fold_haswell(state); } -SZ_PUBLIC void sz_generate_ice(sz_ptr_t output, sz_size_t length, sz_u64_t nonce) { +SZ_PUBLIC void sz_fill_random_ice(sz_ptr_t output, sz_size_t length, sz_u64_t nonce) { if (length <= 16) { __m128i input = _mm_set1_epi64x(nonce); __m128i pi = _mm_load_si128((__m128i const *)_sz_hash_pi_constants()); @@ -1796,17 +1811,17 @@ SZ_DYNAMIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length, sz_u64_t seed) { #endif } -SZ_DYNAMIC void sz_generate(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { +SZ_DYNAMIC void sz_fill_random(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { #if SZ_USE_ICE - sz_generate_ice(text, length, nonce); + sz_fill_random_ice(text, length, nonce); #elif SZ_USE_SKYLAKE - sz_generate_skylake(text, length, nonce); + sz_fill_random_skylake(text, length, nonce); #elif SZ_USE_HASWELL - sz_generate_haswell(text, length, nonce); + sz_fill_random_haswell(text, length, nonce); #elif SZ_USE_NEON - sz_generate_neon(text, length, nonce); + sz_fill_random_neon(text, length, nonce); #else - sz_generate_serial(text, length, nonce); + sz_fill_random_serial(text, length, nonce); #endif } diff --git a/include/stringzilla/similarity.h b/include/stringzilla/similarity.h index 60540b33..058b1313 100644 --- a/include/stringzilla/similarity.h +++ b/include/stringzilla/similarity.h @@ -413,11 +413,11 @@ SZ_INTERNAL sz_status_t _sz_levenshtein_distance_wagner_fisher_serial( // // If the strings contain Unicode characters, let's estimate the max character width, // and use it to allocate a larger buffer to decode UTF8. 
- sz_charset_t ascii_charset; - sz_charset_init_ascii(&ascii_charset); - sz_charset_invert(&ascii_charset); - int const longer_is_ascii = sz_find_charset_serial(longer, longer_length, &ascii_charset) == SZ_NULL_CHAR; - int const shorter_is_ascii = sz_find_charset_serial(shorter, shorter_length, &ascii_charset) == SZ_NULL_CHAR; + sz_byteset_t ascii_byteset; + sz_byteset_init_ascii(&ascii_byteset); + sz_byteset_invert(&ascii_byteset); + int const longer_is_ascii = sz_find_byteset_serial(longer, longer_length, &ascii_byteset) == SZ_NULL_CHAR; + int const shorter_is_ascii = sz_find_byteset_serial(shorter, shorter_length, &ascii_byteset) == SZ_NULL_CHAR; int const will_convert_to_unicode = can_be_unicode == sz_true_k && (!longer_is_ascii || !shorter_is_ascii); if (will_convert_to_unicode) { buffer_length += (shorter_length + longer_length) * sizeof(sz_rune_t); } else { can_be_unicode = sz_false_k; } diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index 284754bd..7642f5ae 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -45,7 +45,7 @@ #include "compare.h" // `sz_equal`, `sz_order` #include "memory.h" // `sz_copy`, `sz_move`, `sz_fill` #include "hash.h" // `sz_bytesum`, `sz_hash`, `sz_state_init`, `sz_state_stream`, `sz_state_fold` -#include "find.h" // `sz_find`, `sz_find_charset`, `sz_rfind` +#include "find.h" // `sz_find`, `sz_find_byteset`, `sz_rfind` #include "small_string.h" // `sz_string_t`, `sz_string_init`, `sz_string_free` #include "similarity.h" // `sz_levenshtein_distance`, `sz_needleman_wunsch_score` #include "sort.h" // `sz_sequence_argsort`, `sz_pgrams_sort`, `sz_pgrams_sort_stable` diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index 143f252e..a1b2de28 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -79,7 +79,7 @@ namespace ashvardanian { namespace stringzilla { template -class basic_char_set; +class basic_byteset; template class basic_string_slice; template @@ -278,23 +278,23 @@ inline carray<64> const &base64() noexcept { * @brief A set of characters represented as a bitset with 256 slots. */ template -class basic_char_set { - sz_charset_t bitset_; +class basic_byteset { + sz_byteset_t bitset_; public: using char_type = char_type_; - sz_constexpr_if_cpp14 basic_char_set() noexcept { - // ! Instead of relying on the `sz_charset_init`, we have to reimplement it to support `constexpr`. + sz_constexpr_if_cpp14 basic_byteset() noexcept { + // ! Instead of relying on the `sz_byteset_init`, we have to reimplement it to support `constexpr`. bitset_._u64s[0] = 0, bitset_._u64s[1] = 0, bitset_._u64s[2] = 0, bitset_._u64s[3] = 0; } - explicit sz_constexpr_if_cpp14 basic_char_set(std::initializer_list chars) noexcept : basic_char_set() { - // ! Instead of relying on the `sz_charset_add(&bitset_, c)`, we have to reimplement it to support `constexpr`. + explicit sz_constexpr_if_cpp14 basic_byteset(std::initializer_list chars) noexcept : basic_byteset() { + // ! Instead of relying on the `sz_byteset_add(&bitset_, c)`, we have to reimplement it to support `constexpr`. 
for (auto c : chars) bitset_._u64s[sz_bitcast(sz_u8_t, c) >> 6] |= (1ull << (sz_bitcast(sz_u8_t, c) & 63u)); } - explicit sz_constexpr_if_cpp14 basic_char_set(char_type const *chars, std::size_t count_characters) noexcept - : basic_char_set() { + explicit sz_constexpr_if_cpp14 basic_byteset(char_type const *chars, std::size_t count_characters) noexcept + : basic_byteset() { for (std::size_t i = 0; i < count_characters; ++i) { char_type c = chars[i]; bitset_._u64s[sz_bitcast(sz_u8_t, c) >> 6] |= (1ull << (sz_bitcast(sz_u8_t, c) & 63u)); @@ -302,8 +302,8 @@ class basic_char_set { } template - explicit sz_constexpr_if_cpp14 basic_char_set(std::array const &chars) noexcept - : basic_char_set() { + explicit sz_constexpr_if_cpp14 basic_byteset(std::array const &chars) noexcept + : basic_byteset() { static_assert(count_characters > 0, "Character array cannot be empty"); for (std::size_t i = 0; i < count_characters; ++i) { char_type c = chars[i]; @@ -311,21 +311,21 @@ class basic_char_set { } } - sz_constexpr_if_cpp14 basic_char_set(basic_char_set const &other) noexcept : bitset_(other.bitset_) {} - sz_constexpr_if_cpp14 basic_char_set &operator=(basic_char_set const &other) noexcept { + sz_constexpr_if_cpp14 basic_byteset(basic_byteset const &other) noexcept : bitset_(other.bitset_) {} + sz_constexpr_if_cpp14 basic_byteset &operator=(basic_byteset const &other) noexcept { bitset_ = other.bitset_; return *this; } - constexpr basic_char_set operator|(basic_char_set other) const noexcept { - basic_char_set result = *this; + constexpr basic_byteset operator|(basic_byteset other) const noexcept { + basic_byteset result = *this; result.bitset_._u64s[0] |= other.bitset_._u64s[0], result.bitset_._u64s[1] |= other.bitset_._u64s[1], result.bitset_._u64s[2] |= other.bitset_._u64s[2], result.bitset_._u64s[3] |= other.bitset_._u64s[3]; return result; } - inline basic_char_set &add(char_type c) noexcept { - sz_charset_add(&bitset_, sz_bitcast(sz_u8_t, c)); + inline basic_byteset &add(char_type c) noexcept { + sz_byteset_add(&bitset_, sz_bitcast(sz_u8_t, c)); return *this; } inline std::size_t size() const noexcept { @@ -333,30 +333,30 @@ class basic_char_set { sz_u64_popcount(bitset_._u64s[0]) + sz_u64_popcount(bitset_._u64s[1]) + // sz_u64_popcount(bitset_._u64s[2]) + sz_u64_popcount(bitset_._u64s[3]); } - inline sz_charset_t &raw() noexcept { return bitset_; } - inline sz_charset_t const &raw() const noexcept { return bitset_; } - inline bool contains(char_type c) const noexcept { return sz_charset_contains(&bitset_, sz_bitcast(sz_u8_t, c)); } - inline basic_char_set inverted() const noexcept { - basic_char_set result = *this; - sz_charset_invert(&result.bitset_); + inline sz_byteset_t &raw() noexcept { return bitset_; } + inline sz_byteset_t const &raw() const noexcept { return bitset_; } + inline bool contains(char_type c) const noexcept { return sz_byteset_contains(&bitset_, sz_bitcast(sz_u8_t, c)); } + inline basic_byteset inverted() const noexcept { + basic_byteset result = *this; + sz_byteset_invert(&result.bitset_); return result; } }; -using char_set = basic_char_set; - -inline char_set ascii_letters_set() { return char_set {ascii_letters(), sizeof(ascii_letters())}; } -inline char_set ascii_lowercase_set() { return char_set {ascii_lowercase(), sizeof(ascii_lowercase())}; } -inline char_set ascii_uppercase_set() { return char_set {ascii_uppercase(), sizeof(ascii_uppercase())}; } -inline char_set ascii_printables_set() { return char_set {ascii_printables(), sizeof(ascii_printables())}; } -inline 
char_set ascii_controls_set() { return char_set {ascii_controls(), sizeof(ascii_controls())}; } -inline char_set digits_set() { return char_set {digits(), sizeof(digits())}; } -inline char_set hexdigits_set() { return char_set {hexdigits(), sizeof(hexdigits())}; } -inline char_set octdigits_set() { return char_set {octdigits(), sizeof(octdigits())}; } -inline char_set punctuation_set() { return char_set {punctuation(), sizeof(punctuation())}; } -inline char_set whitespaces_set() { return char_set {whitespaces(), sizeof(whitespaces())}; } -inline char_set newlines_set() { return char_set {newlines(), sizeof(newlines())}; } -inline char_set base64_set() { return char_set {base64(), sizeof(base64())}; } +using byteset = basic_byteset; + +inline byteset ascii_letters_set() { return byteset {ascii_letters(), sizeof(ascii_letters())}; } +inline byteset ascii_lowercase_set() { return byteset {ascii_lowercase(), sizeof(ascii_lowercase())}; } +inline byteset ascii_uppercase_set() { return byteset {ascii_uppercase(), sizeof(ascii_uppercase())}; } +inline byteset ascii_printables_set() { return byteset {ascii_printables(), sizeof(ascii_printables())}; } +inline byteset ascii_controls_set() { return byteset {ascii_controls(), sizeof(ascii_controls())}; } +inline byteset digits_set() { return byteset {digits(), sizeof(digits())}; } +inline byteset hexdigits_set() { return byteset {hexdigits(), sizeof(hexdigits())}; } +inline byteset octdigits_set() { return byteset {octdigits(), sizeof(octdigits())}; } +inline byteset punctuation_set() { return byteset {punctuation(), sizeof(punctuation())}; } +inline byteset whitespaces_set() { return byteset {whitespaces(), sizeof(whitespaces())}; } +inline byteset newlines_set() { return byteset {newlines(), sizeof(newlines())}; } +inline byteset base64_set() { return byteset {base64(), sizeof(base64())}; } /** * @brief A look-up table for character replacement operations. @@ -1667,10 +1667,10 @@ class basic_string_slice { } /** @brief Find the first occurrence of a character from a set. */ - size_type find(char_set set) const noexcept { return find_first_of(set); } + size_type find(byteset set) const noexcept { return find_first_of(set); } /** @brief Find the last occurrence of a character from a set. */ - size_type rfind(char_set set) const noexcept { return find_last_of(set); } + size_type rfind(byteset set) const noexcept { return find_last_of(set); } #pragma endregion #pragma region Returning Partitions @@ -1682,7 +1682,7 @@ class basic_string_slice { partition_type partition(value_type pattern) const noexcept { return partition_(string_view(&pattern, 1), 1); } /** @brief Split the string into three parts, before the match, the match itself, and after it. */ - partition_type partition(char_set pattern) const noexcept { return partition_(pattern, 1); } + partition_type partition(byteset pattern) const noexcept { return partition_(pattern, 1); } /** @brief Split the string into three parts, before the @b last match, the last match itself, and after it. */ partition_type rpartition(string_view pattern) const noexcept { return rpartition_(pattern, pattern.length()); } @@ -1691,7 +1691,7 @@ class basic_string_slice { partition_type rpartition(value_type pattern) const noexcept { return rpartition_(string_view(&pattern, 1), 1); } /** @brief Split the string into three parts, before the @b last match, the last match itself, and after it. 
*/ - partition_type rpartition(char_set pattern) const noexcept { return rpartition_(pattern, 1); } + partition_type rpartition(byteset pattern) const noexcept { return rpartition_(pattern, 1); } #pragma endregion #pragma endregion @@ -1699,7 +1699,7 @@ class basic_string_slice { #pragma region Matching Character Sets // `isascii` is a macro in MSVC headers - bool contains_only(char_set set) const noexcept { return find_first_not_of(set) == npos; } + bool contains_only(byteset set) const noexcept { return find_first_not_of(set) == npos; } bool is_alpha() const noexcept { return !empty() && contains_only(ascii_letters_set()); } bool is_alnum() const noexcept { return !empty() && contains_only(ascii_letters_set() | digits_set()); } bool is_ascii() const noexcept { return empty() || contains_only(ascii_controls_set() | ascii_printables_set()); } @@ -1715,8 +1715,8 @@ class basic_string_slice { * @param skip Number of characters to skip before the search. * @warning The behavior is @b undefined if `skip > size()`. */ - size_type find_first_of(char_set set, size_type skip = 0) const noexcept { - auto ptr = sz_find_charset(start_ + skip, length_ - skip, &set.raw()); + size_type find_first_of(byteset set, size_type skip = 0) const noexcept { + auto ptr = sz_find_byteset(start_ + skip, length_ - skip, &set.raw()); return ptr ? ptr - start_ : npos; } @@ -1725,30 +1725,30 @@ class basic_string_slice { * @param skip The number of first characters to be skipped. * @warning The behavior is @b undefined if `skip > size()`. */ - size_type find_first_not_of(char_set set, size_type skip = 0) const noexcept { + size_type find_first_not_of(byteset set, size_type skip = 0) const noexcept { return find_first_of(set.inverted(), skip); } /** * @brief Find the last occurrence of a character from a set. */ - size_type find_last_of(char_set set) const noexcept { - auto ptr = sz_rfind_charset(start_, length_, &set.raw()); + size_type find_last_of(byteset set) const noexcept { + auto ptr = sz_rfind_byteset(start_, length_, &set.raw()); return ptr ? ptr - start_ : npos; } /** * @brief Find the last occurrence of a character outside a set. */ - size_type find_last_not_of(char_set set) const noexcept { return find_last_of(set.inverted()); } + size_type find_last_not_of(byteset set) const noexcept { return find_last_of(set.inverted()); } /** * @brief Find the last occurrence of a character from a set. * @param until The offset of the last character to be considered. */ - size_type find_last_of(char_set set, size_type until) const noexcept { + size_type find_last_of(byteset set, size_type until) const noexcept { auto len = sz_min_of_two(until + 1, length_); - auto ptr = sz_rfind_charset(start_, len, &set.raw()); + auto ptr = sz_rfind_byteset(start_, len, &set.raw()); return ptr ? ptr - start_ : npos; } @@ -1756,7 +1756,7 @@ class basic_string_slice { * @brief Find the last occurrence of a character outside a set. * @param until The offset of the last character to be considered. */ - size_type find_last_not_of(char_set set, size_type until) const noexcept { + size_type find_last_not_of(byteset set, size_type until) const noexcept { return find_last_of(set.inverted(), until); } @@ -1839,9 +1839,9 @@ class basic_string_slice { * @brief Python-like convenience function, dropping prefix formed of given characters. * Similar to `boost::algorithm::trim_left_if(str, is_any_of(set))`. 
*/ - string_slice lstrip(char_set set) const noexcept { + string_slice lstrip(byteset set) const noexcept { set = set.inverted(); - auto new_start = (pointer)sz_find_charset(start_, length_, &set.raw()); + auto new_start = (pointer)sz_find_byteset(start_, length_, &set.raw()); return new_start ? string_slice {new_start, length_ - static_cast(new_start - start_)} : string_slice(); } @@ -1850,9 +1850,9 @@ class basic_string_slice { * @brief Python-like convenience function, dropping suffix formed of given characters. * Similar to `boost::algorithm::trim_right_if(str, is_any_of(set))`. */ - string_slice rstrip(char_set set) const noexcept { + string_slice rstrip(byteset set) const noexcept { set = set.inverted(); - auto new_end = (pointer)sz_rfind_charset(start_, length_, &set.raw()); + auto new_end = (pointer)sz_rfind_byteset(start_, length_, &set.raw()); return new_end ? string_slice {start_, static_cast(new_end - start_ + 1)} : string_slice(); } @@ -1860,12 +1860,12 @@ class basic_string_slice { * @brief Python-like convenience function, dropping both the prefix & the suffix formed of given characters. * Similar to `boost::algorithm::trim_if(str, is_any_of(set))`. */ - string_slice strip(char_set set) const noexcept { + string_slice strip(byteset set) const noexcept { set = set.inverted(); - auto new_start = (pointer)sz_find_charset(start_, length_, &set.raw()); + auto new_start = (pointer)sz_find_byteset(start_, length_, &set.raw()); return new_start ? string_slice {new_start, static_cast( - sz_rfind_charset(new_start, length_ - (new_start - start_), &set.raw()) - + sz_rfind_byteset(new_start, length_ - (new_start - start_), &set.raw()) - new_start + 1)} : string_slice(); } @@ -1881,8 +1881,8 @@ class basic_string_slice { using find_disjoint_type = range_matches>; using rfind_disjoint_type = range_rmatches>; - using find_all_chars_type = range_matches>; - using rfind_all_chars_type = range_rmatches>; + using find_all_chars_type = range_matches>; + using rfind_all_chars_type = range_rmatches>; /** @brief Find all potentially @b overlapping occurrences of a given string. */ find_all_type find_all(string_view needle, include_overlaps_type = {}) const noexcept { return {*this, needle}; } @@ -1897,16 +1897,16 @@ class basic_string_slice { rfind_disjoint_type rfind_all(string_view needle, exclude_overlaps_type) const noexcept { return {*this, needle}; } /** @brief Find all occurrences of given characters. */ - find_all_chars_type find_all(char_set set) const noexcept { return {*this, {set}}; } + find_all_chars_type find_all(byteset set) const noexcept { return {*this, {set}}; } /** @brief Find all occurrences of given characters in @b reverse order. */ - rfind_all_chars_type rfind_all(char_set set) const noexcept { return {*this, {set}}; } + rfind_all_chars_type rfind_all(byteset set) const noexcept { return {*this, {set}}; } using split_type = range_splits>; using rsplit_type = range_rsplits>; - using split_chars_type = range_splits>; - using rsplit_chars_type = range_rsplits>; + using split_chars_type = range_splits>; + using rsplit_chars_type = range_rsplits>; /** @brief Split around occurrences of a given string. */ split_type split(string_view delimiter) const noexcept { return {*this, delimiter}; } @@ -1915,10 +1915,10 @@ class basic_string_slice { rsplit_type rsplit(string_view delimiter) const noexcept { return {*this, delimiter}; } /** @brief Split around occurrences of given characters. 
*/ - split_chars_type split(char_set set = whitespaces_set()) const noexcept { return {*this, {set}}; } + split_chars_type split(byteset set = whitespaces_set()) const noexcept { return {*this, {set}}; } /** @brief Split around occurrences of given characters in @b reverse order. */ - rsplit_chars_type rsplit(char_set set = whitespaces_set()) const noexcept { return {*this, {set}}; } + rsplit_chars_type rsplit(byteset set = whitespaces_set()) const noexcept { return {*this, {set}}; } /** @brief Split around the occurrences of all newline characters. */ split_chars_type splitlines() const noexcept { return split(newlines_set()); } @@ -1934,8 +1934,8 @@ class basic_string_slice { size_type bytesum() const noexcept { return static_cast(sz_bytesum(start_, length_)); } /** @brief Populate a character set with characters present in this string. */ - char_set as_set() const noexcept { - char_set set; + byteset as_set() const noexcept { + byteset set; for (auto c : *this) set.add(c); return set; } @@ -2555,17 +2555,17 @@ class basic_string { } /** @brief Find the first occurrence of a character from a set. */ - size_type find(char_set set) const noexcept { return view().find(set); } + size_type find(byteset set) const noexcept { return view().find(set); } /** @brief Find the last occurrence of a character from a set. */ - size_type rfind(char_set set) const noexcept { return view().rfind(set); } + size_type rfind(byteset set) const noexcept { return view().rfind(set); } #pragma endregion #pragma endregion #pragma region Matching Character Sets - bool contains_only(char_set set) const noexcept { return find_first_not_of(set) == npos; } + bool contains_only(byteset set) const noexcept { return find_first_not_of(set) == npos; } bool is_alpha() const noexcept { return !empty() && contains_only(ascii_letters_set()); } bool is_alnum() const noexcept { return !empty() && contains_only(ascii_letters_set() | digits_set()); } bool is_ascii() const noexcept { return empty() || contains_only(ascii_controls_set() | ascii_printables_set()); } @@ -2583,38 +2583,38 @@ class basic_string { * @param skip Number of characters to skip before the search. * @warning The behavior is @b undefined if `skip > size()`. */ - size_type find_first_of(char_set set, size_type skip = 0) const noexcept { return view().find_first_of(set, skip); } + size_type find_first_of(byteset set, size_type skip = 0) const noexcept { return view().find_first_of(set, skip); } /** * @brief Find the first occurrence of a character outside a set. * @param skip The number of first characters to be skipped. * @warning The behavior is @b undefined if `skip > size()`. */ - size_type find_first_not_of(char_set set, size_type skip = 0) const noexcept { + size_type find_first_not_of(byteset set, size_type skip = 0) const noexcept { return view().find_first_not_of(set, skip); } /** * @brief Find the last occurrence of a character from a set. */ - size_type find_last_of(char_set set) const noexcept { return view().find_last_of(set); } + size_type find_last_of(byteset set) const noexcept { return view().find_last_of(set); } /** * @brief Find the last occurrence of a character outside a set. */ - size_type find_last_not_of(char_set set) const noexcept { return view().find_last_not_of(set); } + size_type find_last_not_of(byteset set) const noexcept { return view().find_last_not_of(set); } /** * @brief Find the last occurrence of a character from a set. * @param until The offset of the last character to be considered. 
*/ - size_type find_last_of(char_set set, size_type until) const noexcept { return view().find_last_of(set, until); } + size_type find_last_of(byteset set, size_type until) const noexcept { return view().find_last_of(set, until); } /** * @brief Find the last occurrence of a character outside a set. * @param until The offset of the last character to be considered. */ - size_type find_last_not_of(char_set set, size_type until) const noexcept { + size_type find_last_not_of(byteset set, size_type until) const noexcept { return view().find_last_not_of(set, until); } @@ -2697,7 +2697,7 @@ class basic_string { * @brief Python-like convenience function, dropping prefix formed of given characters. * Similar to `boost::algorithm::trim_left_if(str, is_any_of(set))`. */ - basic_string &lstrip(char_set set) noexcept { + basic_string &lstrip(byteset set) noexcept { auto remaining = view().lstrip(set); remove_prefix(size() - remaining.size()); return *this; @@ -2707,7 +2707,7 @@ class basic_string { * @brief Python-like convenience function, dropping suffix formed of given characters. * Similar to `boost::algorithm::trim_right_if(str, is_any_of(set))`. */ - basic_string &rstrip(char_set set) noexcept { + basic_string &rstrip(byteset set) noexcept { auto remaining = view().rstrip(set); remove_suffix(size() - remaining.size()); return *this; @@ -2717,7 +2717,7 @@ class basic_string { * @brief Python-like convenience function, dropping both the prefix & the suffix formed of given characters. * Similar to `boost::algorithm::trim_if(str, is_any_of(set))`. */ - basic_string &strip(char_set set) noexcept { return lstrip(set).rstrip(set); } + basic_string &strip(byteset set) noexcept { return lstrip(set).rstrip(set); } #pragma endregion #pragma endregion @@ -3339,7 +3339,7 @@ class basic_string { sz_ptr_t start; sz_size_t length; sz_string_range(&string_, &start, &length); - sz_generate(start, length, nonce); + sz_fill_random(start, length, nonce); return *this; } @@ -3393,7 +3393,7 @@ class basic_string { * and might be suboptimal, if you are exporting the cleaned-up string to another buffer. * The algorithm is suboptimal when this string is made exclusively of the pattern. */ - basic_string &replace_all(char_set pattern, string_view replacement) noexcept(false) { + basic_string &replace_all(byteset pattern, string_view replacement) noexcept(false) { if (!try_replace_all(pattern, replacement)) throw std::bad_alloc(); return *this; } @@ -3418,8 +3418,8 @@ class basic_string { * and might be suboptimal, if you are exporting the cleaned-up string to another buffer. * The algorithm is suboptimal when this string is made exclusively of the pattern. */ - bool try_replace_all(char_set pattern, string_view replacement) noexcept { - return try_replace_all_(pattern, replacement); + bool try_replace_all(byteset pattern, string_view replacement) noexcept { + return try_replace_all_(pattern, replacement); } /** @@ -3458,8 +3458,8 @@ static_assert(sizeof(string) == 4 * sizeof(void *), "String size must be 4 point namespace literals { constexpr string_view operator""_sv(char const *str, std::size_t length) noexcept { return {str, length}; } -sz_constexpr_if_cpp14 char_set operator""_cs(char const *str, std::size_t length) noexcept { - return char_set {str, length}; +sz_constexpr_if_cpp14 byteset operator""_bs(char const *str, std::size_t length) noexcept { + return byteset {str, length}; } } // namespace literals @@ -3565,7 +3565,7 @@ bool basic_string::try_replace_all_(pattern_type pattern // 1. 
The pattern and the replacement are of the same length. Piece of cake! // 2. The pattern is longer than the replacement. We need to compact the strings. // 3. The pattern is shorter than the replacement. We may have to allocate more memory. - using matcher_type = typename std::conditional::value, + using matcher_type = typename std::conditional::value, matcher_find_first_of, matcher_find>::type; matcher_type matcher({pattern}); @@ -3611,7 +3611,7 @@ bool basic_string::try_replace_all_(pattern_type pattern // 3. The pattern is shorter than the replacement. We may have to allocate more memory. else { - using rmatcher_type = typename std::conditional::value, + using rmatcher_type = typename std::conditional::value, matcher_find_last_of, matcher_rfind>::type; using rmatches_type = range_rmatches; @@ -3927,7 +3927,7 @@ std::ptrdiff_t alignment_score( template void randomize(basic_string_slice string, sz_u64_t nonce) noexcept { static_assert(!std::is_const::value, "The string must be mutable."); - sz_generate(string.data(), string.size(), nonce); + sz_fill_random(string.data(), string.size(), nonce); } /** diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index 75d76e61..39a6352b 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -22,7 +22,7 @@ * - `sz_string_view_t` - for a C-style `std::string_view`-like structure. * - `sz_memory_allocator_t` - a wrapper for memory-management functions. * - `sz_sequence_t` - a wrapper to access strings forming a sequential container. - * - `sz_charset_t` - a bitset for 256 possible byte values. + * - `sz_byteset_t` - a bitset for 256 possible byte values. */ #ifndef STRINGZILLA_TYPES_H_ #define STRINGZILLA_TYPES_H_ @@ -344,10 +344,13 @@ typedef enum { */ sz_bad_alloc_k = -1, /** - * For algorithms that have an upper bound on some parameter, like the maximum number of iterations, - * or the maximum edit distance, this status indicates that the limit was reached. + * For algorithms that require UTF8 input, this status indicates that the input is invalid. */ - sz_reached_limit_k = -2, + sz_invalid_utf8_k = -2, + /** + * For algorithms that take collections of unique elements, this status indicates presence of duplicates. + */ + sz_contains_duplicates_k = -3, } sz_status_t; /** @@ -374,33 +377,47 @@ typedef struct sz_string_view_t { #pragma region Character Sets /** - * @brief Bit-set structure for 256 possible byte values. Useful for filtering and search. - * @see sz_charset_init, sz_charset_add, sz_charset_contains, sz_charset_invert + * @brief Bit-set semi-opaque structure for 256 possible byte values. Useful for filtering and search. + * @sa sz_byteset_init, sz_byteset_add, sz_byteset_contains, sz_byteset_invert + * + * Example usage: + * + * @code{.c} + * #include + * int main() { + * char const *alphabet = "abcdefghijklmnopqrstuvwxyz"; + * sz_byteset_t byteset; + * sz_byteset_init(&byteset); + * for (sz_size_t i = 0; i < 26; ++i) + * sz_byteset_add(&byteset, alphabet[i]); + * return sz_byteset_contains(&byteset, 'a') && !sz_byteset_contains(&byteset, 'A') ? 0 : 1; + * } + * @endcode */ -typedef union sz_charset_t { +typedef union sz_byteset_t { sz_u64_t _u64s[4]; sz_u32_t _u32s[8]; sz_u16_t _u16s[16]; sz_u8_t _u8s[32]; -} sz_charset_t; +} sz_byteset_t; /** @brief Initializes a bit-set to an empty collection, meaning - all characters are banned. 
*/ -SZ_PUBLIC void sz_charset_init(sz_charset_t *s) { s->_u64s[0] = s->_u64s[1] = s->_u64s[2] = s->_u64s[3] = 0; } +SZ_PUBLIC void sz_byteset_init(sz_byteset_t *s) { s->_u64s[0] = s->_u64s[1] = s->_u64s[2] = s->_u64s[3] = 0; } /** @brief Initializes a bit-set to all ASCII character. */ -SZ_PUBLIC void sz_charset_init_ascii(sz_charset_t *s) { +SZ_PUBLIC void sz_byteset_init_ascii(sz_byteset_t *s) { s->_u64s[0] = s->_u64s[1] = 0xFFFFFFFFFFFFFFFFull; s->_u64s[2] = s->_u64s[3] = 0; } /** @brief Adds a character to the set and accepts @b unsigned integers. */ -SZ_PUBLIC void sz_charset_add_u8(sz_charset_t *s, sz_u8_t c) { s->_u64s[c >> 6] |= (1ull << (c & 63u)); } +SZ_PUBLIC void sz_byteset_add_u8(sz_byteset_t *s, sz_u8_t c) { s->_u64s[c >> 6] |= (1ull << (c & 63u)); } -/** @brief Adds a character to the set. Consider @b sz_charset_add_u8. */ -SZ_PUBLIC void sz_charset_add(sz_charset_t *s, char c) { sz_charset_add_u8(s, *(sz_u8_t *)(&c)); } // bitcast +/** @brief Adds a character to the set. Consider @b sz_byteset_add_u8. */ +SZ_PUBLIC void sz_byteset_add(sz_byteset_t *s, char c) { sz_byteset_add_u8(s, *(sz_u8_t *)(&c)); } // bitcast /** @brief Checks if the set contains a given character and accepts @b unsigned integers. */ -SZ_PUBLIC sz_bool_t sz_charset_contains_u8(sz_charset_t const *s, sz_u8_t c) { +SZ_PUBLIC sz_bool_t sz_byteset_contains_u8(sz_byteset_t const *s, sz_u8_t c) { // Checking the bit can be done in different ways: // - (s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0 // - (s->_u32s[c >> 5] & (1u << (c & 31u))) != 0 @@ -409,13 +426,13 @@ SZ_PUBLIC sz_bool_t sz_charset_contains_u8(sz_charset_t const *s, sz_u8_t c) { return (sz_bool_t)((s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0); } -/** @brief Checks if the set contains a given character. Consider @b sz_charset_contains_u8. */ -SZ_PUBLIC sz_bool_t sz_charset_contains(sz_charset_t const *s, char c) { - return sz_charset_contains_u8(s, *(sz_u8_t *)(&c)); // bitcast +/** @brief Checks if the set contains a given character. Consider @b sz_byteset_contains_u8. */ +SZ_PUBLIC sz_bool_t sz_byteset_contains(sz_byteset_t const *s, char c) { + return sz_byteset_contains_u8(s, *(sz_u8_t *)(&c)); // bitcast } /** @brief Inverts the contents of the set, so allowed character get disallowed, and vice versa. */ -SZ_PUBLIC void sz_charset_invert(sz_charset_t *s) { +SZ_PUBLIC void sz_byteset_invert(sz_byteset_t *s) { s->_u64s[0] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[1] ^= 0xFFFFFFFFFFFFFFFFull, // s->_u64s[2] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[3] ^= 0xFFFFFFFFFFFFFFFFull; } @@ -476,8 +493,8 @@ typedef sz_u64_t (*sz_hash_state_fold_t)(struct sz_hash_state_t const *); /** @brief Signature of `sz_bytesum`. */ typedef sz_u64_t (*sz_bytesum_t)(sz_cptr_t, sz_size_t); -/** @brief Signature of `sz_generate`. */ -typedef void (*sz_generate_t)(sz_ptr_t, sz_size_t, sz_u64_t); +/** @brief Signature of `sz_fill_random`. */ +typedef void (*sz_fill_random_t)(sz_ptr_t, sz_size_t, sz_u64_t); /** @brief Signature of `sz_equal`. */ typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t); @@ -486,7 +503,7 @@ typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t); typedef sz_ordering_t (*sz_order_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); /** @brief Signature of `sz_lookup`. */ -typedef void (*sz_lookup_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t); +typedef void (*sz_lookup_t)(sz_ptr_t, sz_size_t, sz_cptr_t, sz_cptr_t); /** @brief Signature of `sz_move`. 
*/ typedef void (*sz_move_t)(sz_ptr_t, sz_cptr_t, sz_size_t); @@ -501,7 +518,7 @@ typedef sz_cptr_t (*sz_find_byte_t)(sz_cptr_t, sz_size_t, sz_cptr_t); typedef sz_cptr_t (*sz_find_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); /** @brief Signature of `sz_find_set`. */ -typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_charset_t const *); +typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_byteset_t const *); /** @brief Signature of `sz_hamming_distance`. */ typedef sz_status_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_size_t *); @@ -515,18 +532,14 @@ typedef sz_status_t (*sz_needleman_wunsch_score_t)(sz_cptr_t, sz_size_t, sz_cptr sz_error_cost_t, sz_memory_allocator_t *, sz_ssize_t *); /** @brief Signature of `sz_sequence_argsort`. */ -typedef sz_status_t (*sz_sequence_argsort_t)(struct sz_sequence_t const *, sz_memory_allocator_t *, sz_sorted_idx_t *, - sz_bool_t *); +typedef sz_status_t (*sz_sequence_argsort_t)(struct sz_sequence_t const *, sz_memory_allocator_t *, sz_sorted_idx_t *); /** @brief Signature of `sz_pgrams_sort`. */ -typedef sz_status_t (*sz_pgrams_sort_t)(sz_pgram_t *, sz_size_t, sz_memory_allocator_t *, sz_sorted_idx_t *, - sz_bool_t *); - -/** @brief Signature of `sz_sequence_argsort_stable`. */ -typedef sz_sequence_argsort_t sz_sequence_argsort_stable_t; +typedef sz_status_t (*sz_pgrams_sort_t)(sz_pgram_t *, sz_size_t, sz_memory_allocator_t *, sz_sorted_idx_t *); -/** @brief Signature of `sz_pgrams_sort_stable`. */ -typedef sz_pgrams_sort_t sz_pgrams_sort_stable_t; +/** @brief Signature of `sz_sequence_join`. */ +typedef sz_status_t (*sz_sequence_join_t)(struct sz_sequence_t const *, struct sz_sequence_t const *, + sz_memory_allocator_t *, sz_size_t *, sz_sorted_idx_t *, sz_sorted_idx_t *); #pragma endregion diff --git a/scripts/bench_search.cpp b/scripts/bench_search.cpp index 7380a697..6ffd9790 100644 --- a/scripts/bench_search.cpp +++ b/scripts/bench_search.cpp @@ -123,11 +123,11 @@ tracked_binary_functions_t rfind_functions() { return result; } -tracked_binary_functions_t find_charset_functions() { +tracked_binary_functions_t find_byteset_functions() { // ! Despite receiving string-views, following functions are assuming the strings are null-terminated. auto wrap_sz = [](auto function) -> binary_function_t { return binary_function_t([function](std::string_view h, std::string_view n) { - sz::char_set set; + sz::byteset set; for (auto c : n) set.add(c); sz_cptr_t match = function(h.data(), h.size(), &set.raw()); return (match ? match - h.data() : h.size()); @@ -139,26 +139,26 @@ tracked_binary_functions_t find_charset_functions() { auto match = h.find_first_of(n); return (match == std::string_view::npos ? 
h.size() : match); }}, - {"sz_find_charset_serial", wrap_sz(sz_find_charset_serial), true}, + {"sz_find_byteset_serial", wrap_sz(sz_find_byteset_serial), true}, #if SZ_USE_HASWELL - {"sz_find_charset_haswell", wrap_sz(sz_find_charset_haswell), true}, + {"sz_find_byteset_haswell", wrap_sz(sz_find_byteset_haswell), true}, #endif #if SZ_USE_ICE - {"sz_find_charset_ice", wrap_sz(sz_find_charset_ice), true}, + {"sz_find_byteset_ice", wrap_sz(sz_find_byteset_ice), true}, #endif #if SZ_USE_NEON - {"sz_find_charset_neon", wrap_sz(sz_find_charset_neon), true}, + {"sz_find_byteset_neon", wrap_sz(sz_find_byteset_neon), true}, #endif {"strcspn", [](std::string_view h, std::string_view n) { return strcspn(h.data(), n.data()); }}, }; return result; } -tracked_binary_functions_t rfind_charset_functions() { +tracked_binary_functions_t rfind_byteset_functions() { // ! Despite receiving string-views, following functions are assuming the strings are null-terminated. auto wrap_sz = [](auto function) -> binary_function_t { return binary_function_t([function](std::string_view h, std::string_view n) { - sz::char_set set; + sz::byteset set; for (auto c : n) set.add(c); sz_cptr_t match = function(h.data(), h.size(), &set.raw()); return (match ? match - h.data() : 0); @@ -170,12 +170,12 @@ tracked_binary_functions_t rfind_charset_functions() { auto match = h.find_last_of(n); return (match == std::string_view::npos ? 0 : match); }}, - {"sz_rfind_charset_serial", wrap_sz(sz_rfind_charset_serial), true}, + {"sz_rfind_byteset_serial", wrap_sz(sz_rfind_byteset_serial), true}, #if SZ_USE_ICE - {"sz_rfind_charset_ice", wrap_sz(sz_rfind_charset_ice), true}, + {"sz_rfind_byteset_ice", wrap_sz(sz_rfind_byteset_ice), true}, #endif #if SZ_USE_NEON - {"sz_rfind_charset_neon", wrap_sz(sz_rfind_charset_neon), true}, + {"sz_rfind_byteset_neon", wrap_sz(sz_rfind_byteset_neon), true}, #endif }; return result; @@ -304,25 +304,25 @@ int main(int argc, char const **argv) { bench_rfinds(dataset.text, {" "}, rfind_functions()); std::printf("Benchmarking for an [\\n\\r\\v\\f] RegEx:\n"); - bench_finds(dataset.text, {"\n\r\v\f"}, find_charset_functions()); - bench_rfinds(dataset.text, {"\n\r\v\f"}, rfind_charset_functions()); + bench_finds(dataset.text, {"\n\r\v\f"}, find_byteset_functions()); + bench_rfinds(dataset.text, {"\n\r\v\f"}, rfind_byteset_functions()); // Typical ASCII tokenization and validation benchmarks std::printf("Benchmarking for all whitespaces:\n"); - bench_finds(dataset.text, {{sz::whitespaces(), sizeof(sz::whitespaces())}}, find_charset_functions()); - bench_rfinds(dataset.text, {{sz::whitespaces(), sizeof(sz::whitespaces())}}, rfind_charset_functions()); + bench_finds(dataset.text, {{sz::whitespaces(), sizeof(sz::whitespaces())}}, find_byteset_functions()); + bench_rfinds(dataset.text, {{sz::whitespaces(), sizeof(sz::whitespaces())}}, rfind_byteset_functions()); std::printf("Benchmarking for HTML tag start/end:\n"); - bench_finds(dataset.text, {"<>"}, find_charset_functions()); - bench_rfinds(dataset.text, {"<>"}, rfind_charset_functions()); + bench_finds(dataset.text, {"<>"}, find_byteset_functions()); + bench_rfinds(dataset.text, {"<>"}, rfind_byteset_functions()); std::printf("Benchmarking for punctuation marks:\n"); - bench_finds(dataset.text, {{sz::punctuation(), sizeof(sz::punctuation())}}, find_charset_functions()); - bench_rfinds(dataset.text, {{sz::punctuation(), sizeof(sz::punctuation())}}, rfind_charset_functions()); + bench_finds(dataset.text, {{sz::punctuation(), sizeof(sz::punctuation())}}, 
find_byteset_functions()); + bench_rfinds(dataset.text, {{sz::punctuation(), sizeof(sz::punctuation())}}, rfind_byteset_functions()); std::printf("Benchmarking for non-printable characters:\n"); - bench_finds(dataset.text, {{sz::ascii_controls(), sizeof(sz::ascii_controls())}}, find_charset_functions()); - bench_rfinds(dataset.text, {{sz::ascii_controls(), sizeof(sz::ascii_controls())}}, rfind_charset_functions()); + bench_finds(dataset.text, {{sz::ascii_controls(), sizeof(sz::ascii_controls())}}, find_byteset_functions()); + bench_rfinds(dataset.text, {{sz::ascii_controls(), sizeof(sz::ascii_controls())}}, rfind_byteset_functions()); // Baseline benchmarks for present tokens, coming in all lengths std::printf("Benchmarking on present lines:\n"); diff --git a/scripts/test.cpp b/scripts/test.cpp index 8465cf15..a0eac08e 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -63,7 +63,7 @@ namespace sz = ashvardanian::stringzilla; using namespace sz::scripts; using sz::literals::operator""_sv; // for `sz::string_view` -using sz::literals::operator""_cs; // for `sz::char_set` +using sz::literals::operator""_bs; // for `sz::byteset` /* * Instantiate all the templates to make the symbols visible and also check @@ -75,7 +75,7 @@ template class std::basic_string_view; template class sz::basic_string_slice; template class std::basic_string; template class sz::basic_string; -template class sz::basic_char_set; +template class sz::basic_byteset; template class std::vector; template class std::map; @@ -202,7 +202,7 @@ static void test_hashing_on_platform( // * @brief Tests Pseudo-Random Number Generators (PRNGs) ensuring that the same nonce * produces exactly the same output across different SIMD implementations. */ -static void test_random_generator_on_platform(sz_generate_t generate_base, sz_generate_t generate_simd) { +static void test_random_generator_on_platform(sz_fill_random_t generate_base, sz_fill_random_t generate_simd) { auto test_on_nonce = [&](std::size_t length, sz_u64_t nonce) { std::string text_base(length, '\0'); @@ -231,7 +231,7 @@ static void test_simd_against_serial() { sz_hash_state_stream_serial, sz_hash_state_fold_serial, // sz_hash_haswell, sz_hash_state_init_haswell, // sz_hash_state_stream_haswell, sz_hash_state_fold_haswell); - test_random_generator_on_platform(sz_generate_serial, sz_generate_haswell); + test_random_generator_on_platform(sz_fill_random_serial, sz_fill_random_haswell); #endif #if SZ_USE_SKYLAKE test_hashing_on_platform( // @@ -239,7 +239,7 @@ static void test_simd_against_serial() { sz_hash_state_stream_serial, sz_hash_state_fold_serial, // sz_hash_skylake, sz_hash_state_init_skylake, // sz_hash_state_stream_skylake, sz_hash_state_fold_skylake); - test_random_generator_on_platform(sz_generate_serial, sz_generate_skylake); + test_random_generator_on_platform(sz_fill_random_serial, sz_fill_random_skylake); #endif #if SZ_USE_ICE test_hashing_on_platform( // @@ -247,7 +247,7 @@ static void test_simd_against_serial() { sz_hash_state_stream_serial, sz_hash_state_fold_serial, // sz_hash_ice, sz_hash_state_init_ice, // sz_hash_state_stream_ice, sz_hash_state_fold_ice); - test_random_generator_on_platform(sz_generate_serial, sz_generate_ice); + test_random_generator_on_platform(sz_fill_random_serial, sz_fill_random_ice); #endif #if SZ_USE_NEON test_hashing_on_platform( // @@ -255,7 +255,7 @@ static void test_simd_against_serial() { sz_hash_state_stream_serial, sz_hash_state_fold_serial, // sz_hash_neon, sz_hash_state_init_neon, // sz_hash_state_stream_neon, 
sz_hash_state_fold_neon); - test_random_generator_on_platform(sz_generate_serial, sz_generate_neon); + test_random_generator_on_platform(sz_fill_random_serial, sz_fill_random_neon); #endif }; @@ -268,13 +268,13 @@ static void test_ascii_utilities() { using str = string_type; - assert("aaa"_cs.size() == 1ull); - assert("\0\0"_cs.size() == 1ull); - assert("abc"_cs.size() == 3ull); - assert("a\0bc"_cs.size() == 4ull); + assert("aaa"_bs.size() == 1ull); + assert("\0\0"_bs.size() == 1ull); + assert("abc"_bs.size() == 3ull); + assert("a\0bc"_bs.size() == 4ull); - assert(!"abc"_cs.contains('\0')); - assert(str("bca").contains_only("abc"_cs)); + assert(!"abc"_bs.contains('\0')); + assert(str("bca").contains_only("abc"_bs)); assert(!str("").is_alpha()); assert(str("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ").is_alpha()); @@ -309,9 +309,9 @@ static void test_ascii_utilities() { assert(str("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()_+").is_printable()); assert(!str("012🔥").is_printable()); - assert(str("").contains_only("abc"_cs)); - assert(str("abc").contains_only("abc"_cs)); - assert(!str("abcd").contains_only("abc"_cs)); + assert(str("").contains_only("abc"_bs)); + assert(str("abc").contains_only("abc"_bs)); + assert(!str("abcd").contains_only("abc"_bs)); } inline void expect_equality(char const *a, char const *b, std::size_t size) { @@ -1026,9 +1026,9 @@ void test_non_stl_extensions_for_updates() { assert_scoped(str s = "hello", s.replace_all("xx", "xx"), s == "hello"); assert_scoped(str s = "hello", s.replace_all("l", "1"), s == "he11o"); assert_scoped(str s = "hello", s.replace_all("he", "al"), s == "alllo"); - assert_scoped(str s = "hello", s.replace_all("x"_cs, "!"), s == "hello"); - assert_scoped(str s = "hello", s.replace_all("o"_cs, "!"), s == "hell!"); - assert_scoped(str s = "hello", s.replace_all("ho"_cs, "!"), s == "!ell!"); + assert_scoped(str s = "hello", s.replace_all("x"_bs, "!"), s == "hello"); + assert_scoped(str s = "hello", s.replace_all("o"_bs, "!"), s == "hell!"); + assert_scoped(str s = "hello", s.replace_all("ho"_bs, "!"), s == "!ell!"); // Shorter replacements. assert_scoped(str s = "hello", s.replace_all("xx", "x"), s == "hello"); @@ -1036,8 +1036,8 @@ void test_non_stl_extensions_for_updates() { assert_scoped(str s = "hello", s.replace_all("h", ""), s == "ello"); assert_scoped(str s = "hello", s.replace_all("o", ""), s == "hell"); assert_scoped(str s = "hello", s.replace_all("llo", "!"), s == "he!"); - assert_scoped(str s = "hello", s.replace_all("x"_cs, ""), s == "hello"); - assert_scoped(str s = "hello", s.replace_all("lo"_cs, ""), s == "he"); + assert_scoped(str s = "hello", s.replace_all("x"_bs, ""), s == "hello"); + assert_scoped(str s = "hello", s.replace_all("lo"_bs, ""), s == "he"); // Longer replacements. assert_scoped(str s = "hello", s.replace_all("xx", "xxx"), s == "hello"); @@ -1045,8 +1045,8 @@ void test_non_stl_extensions_for_updates() { assert_scoped(str s = "hello", s.replace_all("h", "hh"), s == "hhello"); assert_scoped(str s = "hello", s.replace_all("o", "oo"), s == "helloo"); assert_scoped(str s = "hello", s.replace_all("llo", "llo!"), s == "hello!"); - assert_scoped(str s = "hello", s.replace_all("x"_cs, "xx"), s == "hello"); - assert_scoped(str s = "hello", s.replace_all("lo"_cs, "lo"), s == "helololo"); + assert_scoped(str s = "hello", s.replace_all("x"_bs, "xx"), s == "hello"); + assert_scoped(str s = "hello", s.replace_all("lo"_bs, "lo"), s == "helololo"); // Directly mapping bytes using a Look-Up Table. 
sz::look_up_table invert_case = sz::look_up_table::identity(); @@ -1286,9 +1286,9 @@ static void test_search() { assert("aabaa"_sv.remove_prefix("a") == "abaa"); assert("aabaa"_sv.remove_suffix("a") == "aaba"); - assert("aabaa"_sv.lstrip("a"_cs) == "baa"); - assert("aabaa"_sv.rstrip("a"_cs) == "aab"); - assert("aabaa"_sv.strip("a"_cs) == "b"); + assert("aabaa"_sv.lstrip("a"_bs) == "baa"); + assert("aabaa"_sv.rstrip("a"_bs) == "aab"); + assert("aabaa"_sv.strip("a"_bs) == "b"); // Check more advanced composite operations assert("abbccc"_sv.partition('b').before.size() == 1); @@ -1320,21 +1320,21 @@ static void test_search() { assert("a.b.c.d"_sv.find_all(".").size() == 3); assert("a.,b.,c.,d"_sv.find_all(".,").size() == 3); assert("a.,b.,c.,d"_sv.rfind_all(".,").size() == 3); - assert("a.b,c.d"_sv.find_all(".,"_cs).size() == 3); + assert("a.b,c.d"_sv.find_all(".,"_bs).size() == 3); assert("a...b...c"_sv.rfind_all("..").size() == 4); assert("a...b...c"_sv.rfind_all("..", sz::include_overlaps_type {}).size() == 4); assert("a...b...c"_sv.rfind_all("..", sz::exclude_overlaps_type {}).size() == 2); - auto finds = "a.b.c"_sv.find_all("abcd"_cs).template to>(); + auto finds = "a.b.c"_sv.find_all("abcd"_bs).template to>(); assert(finds.size() == 3); assert(finds[0] == "a"); - auto rfinds = "a.b.c"_sv.rfind_all("abcd"_cs).template to>(); + auto rfinds = "a.b.c"_sv.rfind_all("abcd"_bs).template to>(); assert(rfinds.size() == 3); assert(rfinds[0] == "c"); { - auto splits = ".a..c."_sv.split("."_cs).template to>(); + auto splits = ".a..c."_sv.split("."_bs).template to>(); assert(splits.size() == 5); assert(splits[0] == ""); assert(splits[1] == "a"); @@ -1369,9 +1369,9 @@ static void test_search() { assert(*advanced("a.b.c.d"_sv.split(".").begin(), 3) == "d"); assert(*advanced("a.b.c.d"_sv.rsplit(".").begin(), 3) == "a"); assert("a.b.,c,d"_sv.split(".,").size() == 2); - assert("a.b,c.d"_sv.split(".,"_cs).size() == 4); + assert("a.b,c.d"_sv.split(".,"_bs).size() == 4); - auto rsplits = ".a..c."_sv.rsplit("."_cs).template to>(); + auto rsplits = ".a..c."_sv.rsplit("."_bs).template to>(); assert(rsplits.size() == 5); assert(rsplits[0] == ""); assert(rsplits[1] == "c"); @@ -1724,9 +1724,9 @@ static void test_sequence_algorithms() { sz_cptr_t strings[] = {"banana", "apple", "cherry"}; sz_sequence_from_null_terminated_strings(strings, 3, &sequence); assert(sequence.count == 3); - assert(sequence.get_start(sequence.handle, 0) == "banana"_sv); - assert(sequence.get_start(sequence.handle, 1) == "apple"_sv); - assert(sequence.get_start(sequence.handle, 2) == "cherry"_sv); + assert("banana"_sv == sequence.get_start(sequence.handle, 0)); + assert("apple"_sv == sequence.get_start(sequence.handle, 1)); + assert("cherry"_sv == sequence.get_start(sequence.handle, 2)); } // Basic tests with predetermined orders. @@ -1813,14 +1813,6 @@ static void test_stl_containers() { int main(int argc, char const **argv) { - sz_u128_vec_t some_state, some_key; - randomize_string((char *)&some_state.u8s[0], 16); - randomize_string((char *)&some_key.u8s[0], 16); - sz_u128_vec_t emulated_result = _sz_emulate_aesenc_si128_serial(some_state, some_key); - sz_u128_vec_t hardware_result; - hardware_result.xmm = _mm_aesenc_si128(some_state.xmm, some_key.xmm); - assert(memcmp(&emulated_result, &hardware_result, sizeof(sz_u128_vec_t)) == 0); - // Let's greet the user nicely sz_unused(argc && argv); std::printf("Hi, dear tester! 
You look nice today!\n"); From 2caefac64aeae2c3b5bee2adebfb76dc8acf38f4 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 28 Feb 2025 15:50:37 +0000 Subject: [PATCH 135/751] Fix: Compilation of all bindings --- CONTRIBUTING.md | 14 + Package.swift | 12 +- README.md | 60 +-- include/stringzilla/sort.h | 210 ++++---- include/stringzilla/stringzilla.h | 11 +- include/stringzilla/types.h | 14 +- python/lib.c | 183 +++---- rust/lib.rs | 657 ++++++++++++------------- rustfmt.toml | 1 + scripts/bench_memory.cpp | 2 +- scripts/bench_similarity.cpp | 17 +- scripts/bench_sort.cpp | 26 +- scripts/bench_token.cpp | 16 +- scripts/test.py | 30 +- swift/StringProtocol+StringZilla.swift | 109 ++-- swift/Test.swift | 25 +- 16 files changed, 690 insertions(+), 697 deletions(-) create mode 100644 rustfmt.toml diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d6009a30..a8e825af 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -432,6 +432,13 @@ npm ci && npm test swift build && swift test ``` +To format, consider using [SwiftFormat](https://github.com/nicklockwood/SwiftFormat): + +```bash +brew install swiftformat +swiftformat . +``` + Running Swift on Linux requires a couple of extra steps, as the Swift compiler is not available in the default repositories. Please get the most recent Swift tarball from the [official website](https://www.swift.org/install/). At the time of writing, for 64-bit Arm CPU running Ubuntu 22.04, the following commands would work: @@ -467,6 +474,13 @@ sudo docker run --rm -v "$PWD:/workspace" -w /workspace swift:5.9 /bin/bash -cl cargo test ``` +If you need to isolate a failing test: + +```bash +export RUST_BACKTRACE=full +cargo test -- --test-threads=1 --nocapture +``` + If you are updating the package contents, you can validate the list of included files using the following command: ```bash diff --git a/Package.swift b/Package.swift index c5c15fbb..ec3fe103 100644 --- a/Package.swift +++ b/Package.swift @@ -5,16 +5,16 @@ let package = Package( name: "StringZilla", platforms: [ // Linux doesn't have to be explicitly listed - .iOS(.v13), // For iOS, version 13 and later - .tvOS(.v13), // For tvOS, version 13 and later + .iOS(.v13), // For iOS, version 13 and later + .tvOS(.v13), // For tvOS, version 13 and later .macOS(.v10_15), // For macOS, version 10.15 (Catalina) and later - .watchOS(.v6) // For watchOS, version 6 and later + .watchOS(.v6), // For watchOS, version 6 and later ], products: [ .library( name: "StringZilla", targets: ["StringZillaC", "StringZilla"] - ) + ), ], targets: [ .target( @@ -27,7 +27,7 @@ let package = Package( .define("SZ_AVOID_LIBC", to: "0"), // We need `malloc` from LibC .define("SZ_DEBUG", to: "0"), // We don't need any extra assertions in the C layer .headerSearchPath("include/stringzilla"), // Specify header search paths - .unsafeFlags(["-Wall"]) // Use with caution: specify custom compiler flags + .unsafeFlags(["-Wall"]), // Use with caution: specify custom compiler flags ] ), .target( @@ -41,7 +41,7 @@ let package = Package( dependencies: ["StringZilla"], path: "swift", sources: ["Test.swift"] - ) + ), ], cLanguageStandard: CLanguageStandard.c99 ) diff --git a/README.md b/README.md index 22c8e2b0..18aea8e2 100644 --- a/README.md +++ b/README.md @@ -137,7 +137,7 @@ __Who is this for?__ arm: 0.02 GB/s - sz_find_charset
+ sz_find_byteset
x86: 4.08 · arm: 3.22 GB/s @@ -155,7 +155,7 @@ __Who is this for?__ ⚪ - sz_rfind_charset
+ sz_rfind_byteset
x86: 0.43 · arm: 0.23 GB/s @@ -181,7 +181,7 @@ __Who is this for?__ arm: 5.9 MB/s - sz_generate
+ sz_fill_random
x86: 56.2 · arm: 25.8 MB/s @@ -203,7 +203,7 @@ __Who is this for?__ arm: 140.0 MB/s - sz_look_up_transform
+ sz_lookup
x86: 21.2 · arm: 8.5 GB/s @@ -247,7 +247,7 @@ __Who is this for?__ arm: 2,220 ns - sz_edit_distance
+ sz_levenshtein_distance
x86: 99 · arm: 180 ns @@ -265,7 +265,7 @@ __Who is this for?__ arm: 367 ms - sz_alignment_score
+ sz_needleman_wunsch_score
x86: 73 · arm: 177 ms @@ -396,8 +396,8 @@ x: int = text.find_first_of('chars', start=0, end=sys.maxsize) x: int = text.find_last_of('chars', start=0, end=sys.maxsize) x: int = text.find_first_not_of('chars', start=0, end=sys.maxsize) x: int = text.find_last_not_of('chars', start=0, end=sys.maxsize) -x: Strs = text.split_charset(separator='chars', maxsplit=sys.maxsize, keepseparator=False) -x: Strs = text.rsplit_charset(separator='chars', maxsplit=sys.maxsize, keepseparator=False) +x: Strs = text.split_byteset(separator='chars', maxsplit=sys.maxsize, keepseparator=False) +x: Strs = text.rsplit_byteset(separator='chars', maxsplit=sys.maxsize, keepseparator=False) ``` You can also transform the string using Look-Up Tables (LUTs), mapping it to a different character set. @@ -453,8 +453,8 @@ StringZilla saves a lot of memory by viewing existing memory regions as substrin ```py x: SplitIterator[Str] = text.split_iter(separator=' ', keepseparator=False) x: SplitIterator[Str] = text.rsplit_iter(separator=' ', keepseparator=False) -x: SplitIterator[Str] = text.split_charset_iter(separator='chars', keepseparator=False) -x: SplitIterator[Str] = text.rsplit_charset_iter(separator='chars', keepseparator=False) +x: SplitIterator[Str] = text.split_byteset_iter(separator='chars', keepseparator=False) +x: SplitIterator[Str] = text.rsplit_byteset_iter(separator='chars', keepseparator=False) ``` StringZilla can easily be 10x more memory efficient than native Python classes for tokenization. @@ -654,7 +654,7 @@ By design, StringZilla has a couple of notable differences from LibC: That way `sz_find` and `sz_rfind` are similar to `strstr` and `strrstr` in LibC. Similarly, `sz_find_byte` and `sz_rfind_byte` replace `memchr` and `memrchr`. -The `sz_find_charset` maps to `strspn` and `strcspn`, while `sz_rfind_charset` has no sibling in LibC. +The `sz_find_byteset` maps to `strspn` and `strcspn`, while `sz_rfind_byteset` has no sibling in LibC. @@ -679,11 +679,11 @@ The `sz_find_charset` maps to `strspn` and `strcspn`, while `sz_rfind_charset` h - + - + @@ -923,7 +923,7 @@ StringZilla provides a convenient `partition` function, which returns a tuple of ```cpp auto parts = haystack.partition(':'); // Matching a character auto [before, match, after] = haystack.partition(':'); // Structure unpacking -auto [before, match, after] = haystack.partition(sz::char_set(":;")); // Character-set argument +auto [before, match, after] = haystack.partition(sz::byteset(":;")); // Character-set argument auto [before, match, after] = haystack.partition(" : "); // String argument auto [before, match, after] = haystack.rpartition(sz::whitespaces_set()); // Split around the last whitespace ``` @@ -951,8 +951,8 @@ Here is a sneak peek of the most useful ones. 
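For illustration only (an editorial aside, not part of the patch itself): a minimal C sketch of how the renamed byte-set search can stand in for `strcspn` on a buffer that is not NUL-terminated. The wrapper name `first_delimiter_offset` and the include path are assumptions; `sz_byteset_init`, `sz_byteset_add`, and `sz_find_byteset` are the renamed APIs this change introduces.

```c
#include <stringzilla/stringzilla.h> // assumed include path

// Returns the offset of the first ',' or ';' in `text`, or `length` if none is found,
// mirroring `strcspn` semantics without requiring a NUL terminator.
sz_size_t first_delimiter_offset(sz_cptr_t text, sz_size_t length) {
    sz_byteset_t delimiters;
    sz_byteset_init(&delimiters);     // start with an empty set
    sz_byteset_add(&delimiters, ','); // add the bytes we are looking for
    sz_byteset_add(&delimiters, ';');
    sz_cptr_t match = sz_find_byteset(text, length, &delimiters);
    return match ? (sz_size_t)(match - text) : length;
}
```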
```cpp text.hash(); // -> 64 bit unsigned integer text.ssize(); // -> 64 bit signed length to avoid `static_cast(text.size())` -text.contains_only(" \w\t"); // == text.find_first_not_of(sz::char_set(" \w\t")) == npos; -text.contains(sz::whitespaces_set()); // == text.find(sz::char_set(sz::whitespaces_set())) != npos; +text.contains_only(" \w\t"); // == text.find_first_not_of(sz::byteset(" \w\t")) == npos; +text.contains(sz::whitespaces_set()); // == text.find(sz::byteset(sz::whitespaces_set())) != npos; // Simpler slicing than `substr` text.front(10); // -> sz::string_view @@ -997,7 +997,7 @@ To avoid those, StringZilla provides lazily-evaluated ranges, compatible with th ```cpp for (auto line : haystack.split("\r\n")) - for (auto word : line.split(sz::char_set(" \w\t.,;:!?"))) + for (auto word : line.split(sz::byteset(" \w\t.,;:!?"))) std::cout << word << std::endl; ``` @@ -1006,9 +1006,9 @@ It also allows interleaving matches, if you want both inclusions of `xx` in `xxx Debugging pointer offsets is not a pleasant exercise, so keep the following functions in mind. - `haystack.[r]find_all(needle, interleaving)` -- `haystack.[r]find_all(sz::char_set(""))` +- `haystack.[r]find_all(sz::byteset(""))` - `haystack.[r]split(needle)` -- `haystack.[r]split(sz::char_set(""))` +- `haystack.[r]split(sz::byteset(""))` For $N$ matches the split functions will report $N+1$ matches, potentially including empty strings. Ranges have a few convenience methods as well: @@ -1065,7 +1065,7 @@ sz::string random_string(std::size_t length, char const *alphabet, std::size_t c ``` Mouthful and slow. -StringZilla provides a C native method - `sz_generate` and a convenient C++ wrapper - `sz::generate`. +StringZilla provides a C native method - `sz_fill_random` and a convenient C++ wrapper - `sz::generate`. Similar to Python it also defines the commonly used character sets. ```cpp @@ -1085,9 +1085,9 @@ In text processing, it's often necessary to replace all occurrences of a specifi Standard library functions may not offer the most efficient or convenient methods for performing bulk replacements, especially when dealing with large strings or performance-critical applications. - `haystack.replace_all(needle_string, replacement_string)` -- `haystack.replace_all(sz::char_set(""), replacement_string)` +- `haystack.replace_all(sz::byteset(""), replacement_string)` - `haystack.try_replace_all(needle_string, replacement_string)` -- `haystack.try_replace_all(sz::char_set(""), replacement_string)` +- `haystack.try_replace_all(sz::byteset(""), replacement_string)` - `haystack.transform(sz::look_up_table::identity())` - `haystack.transform(sz::look_up_table::identity(), haystack.data())` @@ -1250,8 +1250,8 @@ sz::find("Hello, world!", "world") // 7 sz::rfind("Hello, world!", "world") // 7 // Generalizations of `memchr::memrchr[123]` -sz::find_char_from("Hello, world!", "world") // 2 -sz::rfind_char_from("Hello, world!", "world") // 11 +sz::find_byte_from("Hello, world!", "world") // 2 +sz::rfind_byte_from("Hello, world!", "world") // 11 ``` Unlike `memchr`, the throughput of `stringzilla` is [high in both normal and reverse-order searches][memchr-benchmarks]. 
@@ -1268,10 +1268,10 @@ let my_cow_str = Cow::from(&my_string);
 // Use the generic function with a String
 assert_eq!(my_string.sz_find("world"), Some(7));
 assert_eq!(my_string.sz_rfind("world"), Some(7));
-assert_eq!(my_string.sz_find_char_from("world"), Some(2));
-assert_eq!(my_string.sz_rfind_char_from("world"), Some(11));
-assert_eq!(my_string.sz_find_char_not_from("world"), Some(0));
-assert_eq!(my_string.sz_rfind_char_not_from("world"), Some(12));
+assert_eq!(my_string.sz_find_byte_from("world"), Some(2));
+assert_eq!(my_string.sz_rfind_byte_from("world"), Some(11));
+assert_eq!(my_string.sz_find_byte_not_from("world"), Some(0));
+assert_eq!(my_string.sz_rfind_byte_not_from("world"), Some(12));
 
 // Same works for &str and Cow<'_, str>
 assert_eq!(my_str.sz_find("world"), Some(7));
@@ -1315,7 +1315,7 @@ s[s.findLast(substring: "o")!...] // "o StringZilla. 👋")
 s[s.findFirst(characterFrom: "aeiou")!...] // "ello, world! Welcome to StringZilla. 👋")
 s[s.findLast(characterFrom: "aeiou")!...] // "a. 👋")
 s[s.findFirst(characterNotFrom: "aeiou")!...] // "Hello, world! Welcome to StringZilla. 👋"
-s.editDistance(from: "Hello, world!")! // 29
+s.levenshteinDistance(from: "Hello, world!")! // 29
 ```
 
 ## Algorithms & Design Decisions 📚
@@ -1561,7 +1561,7 @@ Most StringZilla operations are byte-level, so they work well with ASCII and UTF
 In some cases, like edit-distance computation, the result of byte-level evaluation and character-level evaluation may differ.
 So StringZilla provides following functions to work with Unicode:
 
-- `sz_edit_distance_utf8` - computes the Levenshtein distance between two UTF-8 strings.
+- `sz_levenshtein_distance_utf8` - computes the Levenshtein distance between two UTF-8 strings.
 - `sz_hamming_distance_utf8` - computes the Hamming distance between two UTF-8 strings.
 
 Java, JavaScript, Python 2, C#, and Objective-C, however, use wide characters (`wchar`) - two byte long codes, instead of the more reasonable fixed-length UTF32 or variable-length UTF8.
diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h
index 8e387b70..721ba940 100644
--- a/include/stringzilla/sort.h
+++ b/include/stringzilla/sort.h
@@ -3,11 +3,14 @@
  * @file sort.h
  * @author Ash Vardanian
  *
- * Includes core APIs for `sz_sequence_t` string collections:
+ * Includes core APIs for `sz_sequence_t` string collections with hardware-specific backends:
  *
  * - `sz_sequence_argsort` - to get the sorting permutation of a string collection.
  * - `sz_sequence_join` - to compute the intersection of two arbitrary string collections.
  *
+ * The first can easily be used to implement SORT and GROUPBY operations in SQL, while the second can be used to
+ * implement JOIN operations. Both are essential for implementing efficient database engines.
+ *
  * The core idea of all following string algorithms is to process strings not based on 1 character at a time,
  * but on a larger "Pointer-sized N-grams" fitting in 4 or 8 bytes at once, on 32-bit or 64-bit architectures,
  * respectively. In reality we may not use the full pointer size, but only a few bytes from it, and keep the
@@ -21,7 +24,7 @@
  *
  * Other helpers include:
  *
- * - `sz_pgrams_sort_stable_with_insertion` - for quadratic-complexity sorting of small continuous integer arrays.
+ * - `sz_pgrams_sort_with_insertion` - for quadratic-complexity sorting of small continuous integer arrays.
  * - `sz_sequence_argsort_with_insertion` - for quadratic-complexity sorting of small string collections.
* - `sz_sequence_argsort_stabilize` - updates the sorting permutation to be stable. */ @@ -45,10 +48,11 @@ extern "C" { * * @param[in] sequence Immutable sequence of strings to sort. * @param[in] alloc Optional memory allocator for temporary storage. - * @param[out] order Output permutation that sorts the elements. Must fit at least `sequence->count` integers. + * @param[out] order Output permutation that sorts the elements. * * @retval `sz_success_k` if the operation was successful. * @retval `sz_bad_alloc_k` if the operation failed due to memory allocation failure. + * @pre The @p order array must fit at least `sequence->count` integers. * @post The @p order array will contain a valid permutation of `[0, sequence->count - 1]`. * * Example usage: @@ -60,8 +64,8 @@ extern "C" { * sz_sequence_t sequence; * sz_sequence_from_null_terminated_strings(strings, 3, &sequence); * sz_sorted_idx_t order[3]; - * sz_sequence_argsort(&sequence, NULL, order); - * return order[0] == 1 && order[1] == 0 && order[2] == 2 ? 0 : 1; + * sz_status_t status = sz_sequence_argsort(&sequence, NULL, order); + * return status == sz_success_k && order[0] == 1 && order[1] == 0 && order[2] == 2 ? 0 : 1; * } * @endcode * @@ -69,7 +73,7 @@ extern "C" { * @see https://en.wikipedia.org/wiki/Quicksort * * @note This algorithm is @b unstable: equal elements may change relative order. - * @sa sz_sequence_argsort_stable + * @sa sz_sequence_argsort_stabilize * * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. * @sa sz_sequence_argsort_serial, sz_sequence_argsort_skylake, sz_sequence_argsort_sve @@ -84,10 +88,11 @@ SZ_DYNAMIC sz_status_t sz_sequence_argsort(sz_sequence_t const *sequence, sz_mem * @param[inout] pgrams Continuous buffer of unsigned integers to sort in place. * @param[in] count Number of elements in the sequence. * @param[in] alloc Optional memory allocator for temporary storage. - * @param[out] order Output permutation that sorts the elements. Must fit at least @p count integers. + * @param[out] order Output permutation that sorts the elements. * * @retval `sz_success_k` if the operation was successful. * @retval `sz_bad_alloc_k` if the operation failed due to memory allocation failure. + * @pre The @p order array must fit at least `count` integers. * @post The @p order array will contain a valid permutation of `[0, count - 1]`. * * Example usage: @@ -97,17 +102,14 @@ SZ_DYNAMIC sz_status_t sz_sequence_argsort(sz_sequence_t const *sequence, sz_mem * int main() { * sz_pgram_t pgrams[] = {42, 17, 99, 8}; * sz_sorted_idx_t order[4]; - * sz_pgrams_sort(pgrams, 4, NULL, order); - * return order[0] == 3 && order[1] == 1 && order[2] == 0 && order[3] == 2 ? 0 : 1; + * sz_status_t status = sz_pgrams_sort(pgrams, 4, NULL, order); + * return status == sz_success_k && order[0] == 3 && order[1] == 1 && order[2] == 0 && order[3] == 2 ? 0 : 1; * } * @endcode * * @note The algorithm has linear memory complexity, quadratic worst-case and log-linear average time complexity. * @see https://en.wikipedia.org/wiki/Quicksort * - * @note This algorithm is @b unstable: equal elements may change relative order. - * @sa sz_pgrams_sort_stable - * * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. 
 * @sa sz_pgrams_sort_serial, sz_pgrams_sort_skylake, sz_pgrams_sort_sve
 */
SZ_DYNAMIC sz_status_t sz_pgrams_sort(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc,
                                      sz_sorted_idx_t *order);

 /**
- * @brief Faster @b arg-sort for an arbitrary @b string sequence, using MergeSort.
- *        Outputs the @p order of elements in the immutable @p sequence, that would sort it.
+ * @brief Intersects two arbitrary @b string sequences, using a hash table.
+ *        Outputs the @p first_positions from the @p first_sequence and @p second_positions from
+ *        the @p second_sequence that contain identical strings.
  *
- * This algorithm guarantees stability, ensuring that the relative order of equal elements is preserved.
- * It uses more memory than `sz_sequence_argsort`, but its performance is more predictable.
- * It's preferred for very large inputs, as most memory access happens in a sequential pattern.
- *
- * @param[in] sequence Immutable sequence of strings to sort.
+ * @param[in] first_sequence First immutable sequence of strings to intersect.
+ * @param[in] second_sequence Second immutable sequence of strings to intersect.
  * @param[in] alloc Optional memory allocator for temporary storage.
- * @param[out] order Output permutation that sorts the elements. Must fit at least `sequence->count` integers.
+ * @param[out] intersection_size Number of identical strings in both sequences.
+ * @param[out] first_positions Offset positions of the identical strings from the @p first_sequence.
+ * @param[out] second_positions Offset positions of the identical strings from the @p second_sequence.
  *
  * @retval `sz_success_k` if the operation was successful.
  * @retval `sz_bad_alloc_k` if the operation failed due to memory allocation failure.
- * @post The @p order array will contain a valid permutation of `[0, sequence->count - 1]`.
+ * @retval `sz_contains_duplicates_k` if any of the sequences contain duplicate strings.
+ * @pre The @p first_positions array must fit at least `min(first_sequence->count, second_sequence->count)` items.
+ * @pre The @p second_positions array must fit at least `min(first_sequence->count, second_sequence->count)` items.
  *
  * Example usage:
  *
  * @code{.c}
  * #include
  * int main() {
- *     char const *strings[] = {"banana", "apple", "cherry"};
- *     sz_sequence_t sequence;
- *     sz_sequence_from_null_terminated_strings(strings, 3, &sequence);
- *     sz_sorted_idx_t order[3];
- *     sz_sequence_argsort_stable(&sequence, NULL, order);
- *     return order[0] == 1 && order[1] == 0 && order[2] == 2 ? 0 : 1;
+ *     char const *first[] = {"banana", "apple", "cherry"};
+ *     char const *second[] = {"cherry", "orange", "pineapple", "banana"};
+ *     sz_sequence_t first_sequence, second_sequence;
+ *     sz_sequence_from_null_terminated_strings(first, 3, &first_sequence);
+ *     sz_sequence_from_null_terminated_strings(second, 4, &second_sequence);
+ *     sz_size_t intersection_size;
+ *     sz_sorted_idx_t first_positions[3], second_positions[3]; //? 3 is the size of the smaller sequence
+ *     sz_status_t status = sz_sequence_join(&first_sequence, &second_sequence, NULL,
+ *                                           &intersection_size, first_positions, second_positions);
+ *     return status == sz_success_k && intersection_size == 2 ? 0 : 1;
  * }
  * @endcode
  *
- * @note The algorithm has linear memory complexity and log-linear time complexity.
- * @see https://en.wikipedia.org/wiki/Merge_sort
- *
- * @note This algorithm is @b stable: equal elements maintain their relative order.
- * @sa sz_sequence_argsort + * @note The algorithm has linear memory complexity and linear time complexity. + * @see https://en.wikipedia.org/wiki/Join_(SQL) * * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. - * @sa sz_sequence_argsort_stable_serial, sz_sequence_argsort_stable_skylake, sz_sequence_argsort_stable_sve + * @sa sz_sequence_join_serial, sz_sequence_join_skylake, sz_sequence_join_sve */ -SZ_DYNAMIC sz_status_t sz_sequence_argsort_stable(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_DYNAMIC sz_status_t sz_sequence_join(sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, + sz_memory_allocator_t *alloc, sz_size_t *intersection_size, + sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions); /** * @brief Faster @b inplace `std::stable_sort` for a continuous @b unsigned-integer sequence, using MergeSort. @@ -180,7 +187,7 @@ SZ_DYNAMIC sz_status_t sz_sequence_argsort_stable(sz_sequence_t const *sequence, * int main() { * sz_pgram_t pgrams[] = {42, 17, 99, 8}; * sz_sorted_idx_t order[4]; - * sz_pgrams_sort_stable(pgrams, 4, NULL, order); + * sz_pgrams_join(pgrams, 4, NULL, order); * return order[0] == 3 && order[1] == 1 && order[2] == 0 && order[3] == 2 ? 0 : 1; * } * @endcode @@ -192,10 +199,10 @@ SZ_DYNAMIC sz_status_t sz_sequence_argsort_stable(sz_sequence_t const *sequence, * @sa sz_pgrams_sort * * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. - * @sa sz_pgrams_sort_stable_serial, sz_pgrams_sort_stable_skylake, sz_pgrams_sort_stable_sve + * @sa sz_pgrams_join_serial, sz_pgrams_join_skylake, sz_pgrams_join_sve */ -SZ_DYNAMIC sz_status_t sz_pgrams_sort_stable(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +SZ_DYNAMIC sz_status_t sz_pgrams_join(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); /** @copydoc sz_sequence_argsort */ SZ_PUBLIC sz_status_t sz_sequence_argsort_serial(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, @@ -205,6 +212,8 @@ SZ_PUBLIC sz_status_t sz_sequence_argsort_serial(sz_sequence_t const *sequence, SZ_PUBLIC sz_status_t sz_pgrams_sort_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); +#if SZ_USE_SKYLAKE + /** @copydoc sz_sequence_argsort */ SZ_PUBLIC sz_status_t sz_sequence_argsort_skylake(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); @@ -213,6 +222,16 @@ SZ_PUBLIC sz_status_t sz_sequence_argsort_skylake(sz_sequence_t const *sequence, SZ_PUBLIC sz_status_t sz_pgrams_sort_skylake(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); +/** @copydoc sz_sequence_join */ +SZ_PUBLIC sz_status_t sz_sequence_join_skylake( // + sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, // + sz_memory_allocator_t *alloc, sz_size_t *intersection_size, // + sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions); + +#endif + +#if SZ_USE_SVE + /** @copydoc sz_sequence_argsort */ SZ_PUBLIC sz_status_t sz_sequence_argsort_sve(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); @@ -221,36 +240,20 @@ SZ_PUBLIC sz_status_t sz_sequence_argsort_sve(sz_sequence_t const *sequence, sz_ SZ_PUBLIC sz_status_t sz_pgrams_sort_sve(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, sz_sorted_idx_t 
*order); -/** @copydoc sz_sequence_argsort_stable */ -SZ_PUBLIC sz_status_t sz_sequence_argsort_stable_serial(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); - -/** @copydoc sz_pgrams_sort_stable */ -SZ_PUBLIC sz_status_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +/** @copydoc sz_sequence_join */ +SZ_PUBLIC sz_status_t sz_sequence_join_sve( // + sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, // + sz_memory_allocator_t *alloc, sz_size_t *intersection_size, // + sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions); -/** @copydoc sz_sequence_argsort_stable */ -SZ_PUBLIC sz_status_t sz_sequence_argsort_stable_skylake(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); - -/** @copydoc sz_pgrams_sort_stable */ -SZ_PUBLIC sz_status_t sz_pgrams_sort_stable_skylake(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); - -/** @copydoc sz_sequence_argsort_stable */ -SZ_PUBLIC sz_status_t sz_sequence_argsort_stable_sve(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); - -/** @copydoc sz_pgrams_sort_stable */ -SZ_PUBLIC sz_status_t sz_pgrams_sort_stable_sve(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); +#endif #pragma endregion #pragma region Generic Public Helpers /** - * @brief Quadratic complexity insertion sort adjust for our @b argsort usecase. + * @brief Quadratic complexity @b stable insertion sort adjust for our @b argsort usecase. * Needs no extra memory and is used as a fallback for small inputs. */ SZ_PUBLIC void sz_sequence_argsort_with_insertion(sz_sequence_t const *sequence, sz_sorted_idx_t *order) { @@ -281,11 +284,11 @@ SZ_PUBLIC void sz_sequence_argsort_with_insertion(sz_sequence_t const *sequence, } /** - * @brief Quadratic complexity insertion sort adjust for our @b pgram-sorting usecase. + * @brief Quadratic complexity @b stable insertion sort adjust for our @b pgram-sorting usecase. * Needs no extra memory and is used as a fallback for small inputs. */ -SZ_PUBLIC void sz_pgrams_sort_stable_with_insertion(sz_pgram_t *pgrams, sz_size_t count, sz_sorted_idx_t *order) { +SZ_PUBLIC void sz_pgrams_sort_with_insertion(sz_pgram_t *pgrams, sz_size_t count, sz_sorted_idx_t *order) { // Assume `order` is already initialized with 0, 1, 2, ... N. for (sz_size_t i = 1; i < count; ++i) { @@ -714,7 +717,7 @@ SZ_PUBLIC sz_status_t sz_pgrams_sort_serial(sz_pgram_t *pgrams, sz_size_t count, * @brief Helper function similar to `std::set_union` over pairs of integers and their original indices. 
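
A simplified scalar sketch of such a merge over two already-sorted runs of (pgram, original index) pairs; preferring the first run on ties is what keeps the surrounding merge sort stable. The function name and shape below are illustrative, not the library's internal implementation:

    static void merge_pgram_runs_sketch(                                                                //
        sz_pgram_t const *first_pgrams, sz_sorted_idx_t const *first_indices, sz_size_t first_count,    //
        sz_pgram_t const *second_pgrams, sz_sorted_idx_t const *second_indices, sz_size_t second_count, //
        sz_pgram_t *result_pgrams, sz_sorted_idx_t *result_indices) {
        sz_size_t i = 0, j = 0, k = 0;
        while (i < first_count && j < second_count) {
            // `<=` prefers the earlier run on ties, preserving the relative order of equal keys.
            if (first_pgrams[i] <= second_pgrams[j]) {
                result_pgrams[k] = first_pgrams[i], result_indices[k] = first_indices[i], ++i;
            }
            else { result_pgrams[k] = second_pgrams[j], result_indices[k] = second_indices[j], ++j; }
            ++k;
        }
        for (; i < first_count; ++i, ++k) result_pgrams[k] = first_pgrams[i], result_indices[k] = first_indices[i];
        for (; j < second_count; ++j, ++k) result_pgrams[k] = second_pgrams[j], result_indices[k] = second_indices[j];
    }
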
* @see https://en.cppreference.com/w/cpp/algorithm/set_union */ -SZ_INTERNAL void _sz_sequence_argsort_stable_serial_merge( // +SZ_INTERNAL void _sz_sequence_join_serial_merge( // sz_pgram_t const *first_pgrams, sz_sorted_idx_t const *first_indices, sz_size_t first_count, // sz_pgram_t const *second_pgrams, sz_sorted_idx_t const *second_indices, sz_size_t second_count, // sz_pgram_t *result_pgrams, sz_sorted_idx_t *result_indices) { @@ -761,8 +764,8 @@ SZ_INTERNAL void _sz_sequence_argsort_stable_serial_merge( _sz_assert(merged_begin[i - 1] <= merged_begin[i] && "The merged pgrams must be in ascending order."); } -SZ_PUBLIC sz_status_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { +SZ_PUBLIC sz_status_t sz_pgrams_join_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { // First, initialize the `order` with `std::iota`-like behavior. for (sz_size_t i = 0; i != count; ++i) order[i] = i; @@ -770,7 +773,7 @@ SZ_PUBLIC sz_status_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t // On very small collections - just use the quadratic-complexity insertion sort // without any smart optimizations or memory allocations. if (count <= 32) { - sz_pgrams_sort_stable_with_insertion(pgrams, count, order); + sz_pgrams_sort_with_insertion(pgrams, count, order); return sz_success_k; } @@ -779,7 +782,7 @@ SZ_PUBLIC sz_status_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t // For the tail of the array, sort it with insertion sort. sz_size_t const tail_count = count & 7u; - sz_pgrams_sort_stable_with_insertion(pgrams + count - tail_count, tail_count, order + count - tail_count); + sz_pgrams_sort_with_insertion(pgrams + count - tail_count, tail_count, order + count - tail_count); // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. sz_memory_allocator_t global_alloc; @@ -821,7 +824,7 @@ SZ_PUBLIC sz_status_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t else if (i + left_count + right_count > count) { right_count = count - (i + left_count); } // Merge the two runs: - _sz_sequence_argsort_stable_serial_merge( // + _sz_sequence_join_serial_merge( // src_pgrams + i, src_order + i, left_count, // src_pgrams + i + run_size, src_order + i + run_size, right_count, // dst_pgrams + i, dst_order + i); @@ -844,9 +847,11 @@ SZ_PUBLIC sz_status_t sz_pgrams_sort_stable_serial(sz_pgram_t *pgrams, sz_size_t return sz_success_k; } -SZ_PUBLIC sz_status_t sz_sequence_argsort_stable_serial(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { - sz_unused(sequence && alloc && order); +SZ_PUBLIC sz_status_t sz_sequence_join_serial( // + sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, // + sz_memory_allocator_t *alloc, sz_size_t *intersection_size, // + sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions) { + sz_unused(first_sequence && second_sequence && alloc && intersection_size && first_positions && second_positions); return sz_success_k; } @@ -967,7 +972,7 @@ SZ_INTERNAL void _sz_sequence_argsort_skylake_3way_partition( * @brief Recursive Quick-Sort implementation backing both the `sz_sequence_argsort_skylake` and * `sz_pgrams_sort_skylake`, and using the `_sz_sequence_argsort_skylake_3way_partition` under the hood. 
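
The `_sz_sequence_argsort_skylake_3way_partition` routine referenced here follows the classic three-way ("Dutch national flag") partitioning idea: split a range into less-than, equal-to, and greater-than-pivot groups, moving the `order` entries in lockstep with the pgrams. A scalar sketch of that idea, illustrative only and ignoring the AVX-512 specifics:

    static void three_way_partition_sketch(sz_pgram_t *pgrams, sz_sorted_idx_t *order, sz_size_t count,
                                           sz_pgram_t pivot) {
        sz_size_t less = 0, equal = 0, greater = count;
        while (equal < greater) {
            if (pgrams[equal] < pivot) {
                sz_pgram_t swapped_pgram = pgrams[less];
                pgrams[less] = pgrams[equal], pgrams[equal] = swapped_pgram;
                sz_sorted_idx_t swapped_index = order[less];
                order[less] = order[equal], order[equal] = swapped_index;
                ++less, ++equal;
            }
            else if (pgrams[equal] > pivot) {
                --greater;
                sz_pgram_t swapped_pgram = pgrams[greater];
                pgrams[greater] = pgrams[equal], pgrams[equal] = swapped_pgram;
                sz_sorted_idx_t swapped_index = order[greater];
                order[greater] = order[equal], order[equal] = swapped_index;
            }
            else { ++equal; }
        }
        // On exit: [0, less) < pivot, [less, equal) == pivot, [equal, count) > pivot.
    }
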
*/ -SZ_INTERNAL void _sz_sequence_argsort_skylake_recursively( // +SZ_PUBLIC void _sz_sequence_argsort_skylake_recursively( // sz_pgram_t *initial_pgrams, sz_sorted_idx_t *initial_order, // sz_pgram_t *temporary_pgrams, sz_sorted_idx_t *temporary_order, // sz_size_t const start_in_sequence, sz_size_t const end_in_sequence) { @@ -977,7 +982,7 @@ SZ_INTERNAL void _sz_sequence_argsort_skylake_recursively( // sz_size_t const count = end_in_sequence - start_in_sequence; sz_size_t const pgrams_per_register = sizeof(sz_u512_vec_t) / sizeof(sz_pgram_t); if (count <= pgrams_per_register) { - sz_pgrams_sort_stable_with_insertion( // + sz_pgrams_sort_with_insertion( // initial_pgrams + start_in_sequence, count, initial_order + start_in_sequence); return; } @@ -1040,12 +1045,12 @@ SZ_PUBLIC void _sz_sequence_argsort_skylake_next_pgrams( sz_size_t const start_character) { // Prepare the new range of pgrams - _sz_sequence_argsort_serial_export_next_pgrams(sequence, global_pgrams, global_order, start_in_sequence, - end_in_sequence, start_character); + _sz_sequence_argsort_serial_export_next_pgrams( // + sequence, global_pgrams, global_order, start_in_sequence, end_in_sequence, start_character); // Sort current pgrams with a quicksort - _sz_sequence_argsort_skylake_recursively(global_pgrams, global_order, temporary_pgrams, temporary_order, - start_in_sequence, end_in_sequence); + _sz_sequence_argsort_skylake_recursively( // + global_pgrams, global_order, temporary_pgrams, temporary_order, start_in_sequence, end_in_sequence); // Depending on the architecture, we will export a different number of bytes. // On 32-bit architectures, we will export 3 bytes, and on 64-bit architectures - 7 bytes. @@ -1064,11 +1069,11 @@ SZ_PUBLIC void _sz_sequence_argsort_skylake_next_pgrams( sz_size_t current_pgram_length = (sz_size_t)current_pgram_str[0]; //! 
The byte order was swapped int has_multiple_strings = nested_end - nested_start > 1; int has_more_characters_in_each = current_pgram_length == pgram_capacity; - if (has_multiple_strings && has_more_characters_in_each) { - _sz_sequence_argsort_skylake_next_pgrams(sequence, global_pgrams, global_order, temporary_pgrams, - temporary_order, nested_start, nested_end, - start_character + pgram_capacity); - } + if (has_multiple_strings && has_more_characters_in_each) + _sz_sequence_argsort_skylake_next_pgrams( // + sequence, global_pgrams, global_order, temporary_pgrams, temporary_order, nested_start, nested_end, + start_character + pgram_capacity); + // Move to the next nested_start = nested_end; } @@ -1111,6 +1116,14 @@ SZ_PUBLIC sz_status_t sz_sequence_argsort_skylake(sz_sequence_t const *sequence, return sz_success_k; } +SZ_PUBLIC sz_status_t sz_sequence_join_skylake( // + sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, // + sz_memory_allocator_t *alloc, sz_size_t *intersection_size, // + sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions) { + sz_unused(first_sequence && second_sequence && alloc && intersection_size && first_positions && second_positions); + return sz_success_k; +} + #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_SKYLAKE @@ -1144,25 +1157,24 @@ SZ_DYNAMIC sz_status_t sz_pgrams_sort(sz_pgram_t *pgrams, sz_size_t count, sz_me #endif } -SZ_DYNAMIC sz_status_t sz_sequence_argsort_stable(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { +SZ_DYNAMIC sz_status_t sz_sequence_join(sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, + sz_memory_allocator_t *alloc, sz_size_t *intersection_size, + sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions) { #if SZ_USE_SKYLAKE - return sz_sequence_argsort_skylake(sequence, alloc, order); + return sz_sequence_join_skylake( // + first_sequence, second_sequence, // + alloc, intersection_size, // + first_positions, second_positions); #elif SZ_USE_SVE - return sz_sequence_argsort_sve(sequence, alloc, order); + return sz_sequence_join_sve( // + first_sequence, second_sequence, // + alloc, intersection_size, // + first_positions, second_positions); #else - return sz_sequence_argsort_serial(sequence, alloc, order); -#endif -} - -SZ_DYNAMIC sz_status_t sz_pgrams_sort_stable(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { -#if SZ_USE_SKYLAKE - return sz_pgrams_sort_skylake(pgrams, count, alloc, order); -#elif SZ_USE_SVE - return sz_pgrams_sort_sve(pgrams, count, alloc, order); -#else - return sz_pgrams_sort_serial(pgrams, count, alloc, order); + return sz_sequence_join_serial( // + first_sequence, second_sequence, // + alloc, intersection_size, // + first_positions, second_positions); #endif } diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index 7642f5ae..c497d4f1 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -64,12 +64,13 @@ typedef enum { sz_cap_haswell_k = 1 << 10, ///< x86 AVX2 capability with FMA and F16C extensions sz_cap_skylake_k = 1 << 11, ///< x86 AVX512 baseline capability - sz_cap_ice_k = 1 << 12, ///< x86 AVX512 capability with advanced integer algos + sz_cap_ice_k = 1 << 12, ///< x86 AVX512 capability with advanced integer algos and AES extensions - sz_cap_neon_k = 1 << 20, ///< ARM NEON baseline capability - sz_cap_sve_k = 1 << 21, ///< ARM SVE baseline capability - 
sz_cap_sve2_k = 1 << 22, ///< ARM SVE2 capability - sz_cap_sve2p1_k = 1 << 23, ///< ARM SVE2p1 capability + sz_cap_neon_k = 1 << 20, ///< ARM NEON baseline capability + sz_cap_neon_aes_k = 1 << 21, ///< ARM NEON baseline capability with AES extensions + sz_cap_sve_k = 1 << 24, ///< ARM SVE baseline capability + sz_cap_sve2_k = 1 << 25, ///< ARM SVE2 capability + sz_cap_sve2_aes_k = 1 << 26, ///< ARM SVE2 capability with AES extensions } sz_capability_t; diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index 39a6352b..c4f71907 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -82,12 +82,8 @@ */ #if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64) #define _SZ_IS_64_BIT (1) -#define SZ_SIZE_MAX (0xFFFFFFFFFFFFFFFFull) // Largest unsigned integer that fits into 64 bits. -#define SZ_SSIZE_MAX (0x7FFFFFFFFFFFFFFFull) // Largest signed integer that fits into 64 bits. #else #define _SZ_IS_64_BIT (0) -#define SZ_SIZE_MAX (0xFFFFFFFFu) // Largest unsigned integer that fits into 32 bits. -#define SZ_SSIZE_MAX (0x7FFFFFFFu) // Largest signed integer that fits into 32 bits. #endif /** @@ -302,10 +298,9 @@ typedef unsigned long long sz_u64_t; // Always 64 bits typedef unsigned long long sz_size_t; // 64-bit. typedef long long sz_ssize_t; // 64-bit. #else -typedef unsigned sz_size_t; // 32-bit. -typedef unsigned sz_ssize_t; // 32-bit. +typedef unsigned int sz_size_t; // 32-bit. +typedef int sz_ssize_t; // 32-bit. #endif // _SZ_IS_64_BIT - #endif // SZ_AVOID_LIBC /** @@ -774,6 +769,11 @@ SZ_PUBLIC void sz_sequence_from_null_terminated_strings(sz_cptr_t *start, sz_siz * like equality checks and relative order computing. */ #define SZ_CACHE_LINE_WIDTH (64) // bytes +#define SZ_SIZE_MAX ((sz_size_t)(-1)) +#define SZ_SSIZE_MAX ((sz_ssize_t)(SZ_SIZE_MAX >> 1)) + +SZ_INTERNAL sz_size_t _sz_size_max(void) { return SZ_SIZE_MAX; } +SZ_INTERNAL sz_ssize_t _sz_ssize_max(void) { return SZ_SSIZE_MAX; } /** * @brief Similar to `assert`, the `_sz_assert` is used in the `SZ_DEBUG` mode diff --git a/python/lib.c b/python/lib.c index 6e334719..46ed1c51 100644 --- a/python/lib.c +++ b/python/lib.c @@ -208,12 +208,12 @@ static sz_ptr_t temporary_memory_allocate(sz_size_t size, sz_string_view_t *exis static void temporary_memory_free(sz_ptr_t start, sz_size_t size, sz_string_view_t *existing) {} -static sz_cptr_t parts_get_start(sz_sequence_t *seq, sz_size_t i) { - return ((sz_string_view_t const *)seq->handle)[i].start; +static sz_cptr_t parts_get_start(void const *handle, sz_size_t i) { + return ((sz_string_view_t const *)handle)[i].start; } -static sz_size_t parts_get_length(sz_sequence_t *seq, sz_size_t i) { - return ((sz_string_view_t const *)seq->handle)[i].length; +static sz_size_t parts_get_length(void const *handle, sz_size_t i) { + return ((sz_string_view_t const *)handle)[i].length; } void reverse_offsets(sz_sorted_idx_t *array, size_t length) { @@ -236,7 +236,7 @@ void reverse_haystacks(sz_string_view_t *array, size_t length) { } } -void apply_order(sz_string_view_t *array, sz_sorted_idx_t *order, size_t length) { +void permute(sz_string_view_t *array, sz_sorted_idx_t *order, size_t length) { for (size_t i = 0; i < length; ++i) { if (i == order[i]) continue; sz_string_view_t temp = array[i]; @@ -682,7 +682,7 @@ static PyObject *Str_repr(Str *self) { } } -static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)sz_hash(self->memory.start, self->memory.length); } +static Py_hash_t Str_hash(Str *self) { return 
(Py_hash_t)sz_hash(self->memory.start, self->memory.length, 0); } static char const doc_like_hash[] = // "Compute the hash value of the string.\n" @@ -713,7 +713,7 @@ static PyObject *Str_like_hash(PyObject *self, PyObject *args, PyObject *kwargs) return NULL; } - sz_u64_t result = sz_hash(text.start, text.length); + sz_u64_t result = sz_hash(text.start, text.length, 0); return PyLong_FromUnsignedLongLong((unsigned long long)result); } @@ -1837,7 +1837,8 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) { return PyLong_FromSize_t(count); } -static PyObject *_Str_edit_distance(PyObject *self, PyObject *args, PyObject *kwargs, sz_edit_distance_t function) { +static PyObject *_Str_levenshtein_distance(PyObject *self, PyObject *args, PyObject *kwargs, + sz_levenshtein_distance_t function) { int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); Py_ssize_t nargs = PyTuple_Size(args); if (nargs < !is_member + 1 || nargs > !is_member + 2) { @@ -1877,10 +1878,12 @@ static PyObject *_Str_edit_distance(PyObject *self, PyObject *args, PyObject *kw reusing_allocator.free = &temporary_memory_free; reusing_allocator.handle = &temporary_memory; - sz_size_t distance = function(str1.start, str1.length, str2.start, str2.length, bound, &reusing_allocator); + sz_size_t distance; + sz_status_t status = + function(str1.start, str1.length, str2.start, str2.length, bound, &reusing_allocator, &distance); // Check for memory allocation issues - if (distance == SZ_SIZE_MAX) { + if (status != sz_success_k) { PyErr_NoMemory(); return NULL; } @@ -1888,7 +1891,7 @@ static PyObject *_Str_edit_distance(PyObject *self, PyObject *args, PyObject *kw return PyLong_FromSize_t(distance); } -static char const doc_edit_distance[] = // +static char const doc_levenshtein_distance[] = // "Compute the Levenshtein edit distance between two strings.\n" "\n" "Args:\n" @@ -1898,11 +1901,11 @@ static char const doc_edit_distance[] = // "Returns:\n" " int: The edit distance (number of insertions, deletions, substitutions)."; -static PyObject *Str_edit_distance(PyObject *self, PyObject *args, PyObject *kwargs) { - return _Str_edit_distance(self, args, kwargs, &sz_edit_distance); +static PyObject *Str_levenshtein_distance(PyObject *self, PyObject *args, PyObject *kwargs) { + return _Str_levenshtein_distance(self, args, kwargs, &sz_levenshtein_distance); } -static char const doc_edit_distance_unicode[] = // +static char const doc_levenshtein_distance_unicode[] = // "Compute the Levenshtein edit distance between two Unicode strings.\n" "\n" "Args:\n" @@ -1912,8 +1915,8 @@ static char const doc_edit_distance_unicode[] = // "Returns:\n" " int: The edit distance in Unicode characters."; -static PyObject *Str_edit_distance_unicode(PyObject *self, PyObject *args, PyObject *kwargs) { - return _Str_edit_distance(self, args, kwargs, &sz_edit_distance_utf8); +static PyObject *Str_levenshtein_distance_unicode(PyObject *self, PyObject *args, PyObject *kwargs) { + return _Str_levenshtein_distance(self, args, kwargs, &sz_levenshtein_distance_utf8); } static PyObject *_Str_hamming_distance(PyObject *self, PyObject *args, PyObject *kwargs, @@ -1951,10 +1954,11 @@ static PyObject *_Str_hamming_distance(PyObject *self, PyObject *args, PyObject return NULL; } - sz_size_t distance = function(str1.start, str1.length, str2.start, str2.length, (sz_size_t)bound); + sz_size_t distance; + sz_status_t status = function(str1.start, str1.length, str2.start, str2.length, (sz_size_t)bound, &distance); // Check for memory 
allocation issues - if (distance == SZ_SIZE_MAX) { + if (status != sz_success_k) { PyErr_NoMemory(); return NULL; } @@ -1990,7 +1994,7 @@ static PyObject *Str_hamming_distance_unicode(PyObject *self, PyObject *args, Py return _Str_hamming_distance(self, args, kwargs, &sz_hamming_distance_utf8); } -static char const doc_alignment_score[] = // +static char const doc_needleman_wunsch_score[] = // "Compute the Needleman-Wunsch alignment score between two strings.\n" "\n" "Args:\n" @@ -2002,7 +2006,7 @@ static char const doc_alignment_score[] = // "Returns:\n" " int: The alignment score."; -static PyObject *Str_alignment_score(PyObject *self, PyObject *args, PyObject *kwargs) { +static PyObject *Str_needleman_wunsch_score(PyObject *self, PyObject *args, PyObject *kwargs) { int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); Py_ssize_t nargs = PyTuple_Size(args); if (nargs < !is_member + 1 || nargs > !is_member + 2) { @@ -2074,14 +2078,15 @@ static PyObject *Str_alignment_score(PyObject *self, PyObject *args, PyObject *k reusing_allocator.free = &temporary_memory_free; reusing_allocator.handle = &temporary_memory; - sz_ssize_t score = sz_alignment_score(str1.start, str1.length, str2.start, str2.length, substitutions, - (sz_error_cost_t)gap, &reusing_allocator); + sz_ssize_t score; + sz_status_t status = sz_needleman_wunsch_score(str1.start, str1.length, str2.start, str2.length, substitutions, + (sz_error_cost_t)gap, &reusing_allocator, &score); // Don't forget to release the buffer view PyBuffer_Release(&substitutions_view); // Check for memory allocation issues - if (score == SZ_SSIZE_MAX) { + if (status != sz_success_k) { PyErr_NoMemory(); return NULL; } @@ -2259,11 +2264,11 @@ static PyObject *Str_translate(PyObject *self, PyObject *args, PyObject *kwargs) } sz_string_view_t look_up_table_str; - SZ_ALIGN64 char look_up_table[256]; + _SZ_ALIGN64 char look_up_table[256]; if (PyDict_Check(look_up_table_obj)) { // If any character is not defined, it will be replaced with itself: - for (int i = 0; i < 256; i++) { look_up_table[i] = (char)i; } + for (int i = 0; i < 256; i++) look_up_table[i] = (char)i; // Process the dictionary into the look-up table PyObject *key, *value; @@ -2305,7 +2310,7 @@ static PyObject *Str_translate(PyObject *self, PyObject *args, PyObject *kwargs) // Perform the translation using the look-up table if (is_inplace) { - sz_look_up_transform(str.start, str.length, look_up_table, str.start); + sz_lookup(str.start, str.length, str.start, look_up_table); Py_RETURN_NONE; } // Allocate a string of the same size, get it's raw pointer and transform the data into it @@ -2321,7 +2326,7 @@ static PyObject *Str_translate(PyObject *self, PyObject *args, PyObject *kwargs) } sz_ptr_t new_buffer = (sz_ptr_t)PyUnicode_DATA(new_unicode_obj); - sz_look_up_transform(str.start, str.length, look_up_table, new_buffer); + sz_lookup(new_buffer, str.length, str.start, look_up_table); return new_unicode_obj; } else { @@ -2333,7 +2338,7 @@ static PyObject *Str_translate(PyObject *self, PyObject *args, PyObject *kwargs) // Get the buffer and perform the transformation sz_ptr_t new_buffer = (sz_ptr_t)PyBytes_AS_STRING(new_bytes_obj); - sz_look_up_transform(str.start, str.length, look_up_table, new_buffer); + sz_lookup(new_buffer, str.length, str.start, look_up_table); return new_bytes_obj; } } @@ -2354,7 +2359,7 @@ static PyObject *Str_find_first_of(PyObject *self, PyObject *args, PyObject *kwa Py_ssize_t signed_offset; sz_string_view_t text; sz_string_view_t separator; - if 
(!_Str_find_implementation_(self, args, kwargs, &sz_find_char_from, sz_false_k, &signed_offset, &text, + if (!_Str_find_implementation_(self, args, kwargs, &sz_find_byte_from, sz_false_k, &signed_offset, &text, &separator)) return NULL; return PyLong_FromSsize_t(signed_offset); @@ -2375,7 +2380,7 @@ static PyObject *Str_find_first_not_of(PyObject *self, PyObject *args, PyObject Py_ssize_t signed_offset; sz_string_view_t text; sz_string_view_t separator; - if (!_Str_find_implementation_(self, args, kwargs, &sz_find_char_not_from, sz_false_k, &signed_offset, &text, + if (!_Str_find_implementation_(self, args, kwargs, &sz_find_byte_not_from, sz_false_k, &signed_offset, &text, &separator)) return NULL; return PyLong_FromSsize_t(signed_offset); @@ -2396,7 +2401,7 @@ static PyObject *Str_find_last_of(PyObject *self, PyObject *args, PyObject *kwar Py_ssize_t signed_offset; sz_string_view_t text; sz_string_view_t separator; - if (!_Str_find_implementation_(self, args, kwargs, &sz_rfind_char_from, sz_true_k, &signed_offset, &text, + if (!_Str_find_implementation_(self, args, kwargs, &sz_rfind_byte_from, sz_true_k, &signed_offset, &text, &separator)) return NULL; return PyLong_FromSsize_t(signed_offset); @@ -2417,7 +2422,7 @@ static PyObject *Str_find_last_not_of(PyObject *self, PyObject *args, PyObject * Py_ssize_t signed_offset; sz_string_view_t text; sz_string_view_t separator; - if (!_Str_find_implementation_(self, args, kwargs, &sz_rfind_char_not_from, sz_true_k, &signed_offset, &text, + if (!_Str_find_implementation_(self, args, kwargs, &sz_rfind_byte_not_from, sz_true_k, &signed_offset, &text, &separator)) return NULL; return PyLong_FromSsize_t(signed_offset); @@ -2456,7 +2461,7 @@ static SplitIterator *Str_split_iter_(PyObject *text_obj, PyObject *separator_ob /** * @brief Implements the normal order split logic for both string-delimiters and character sets. - * Produuces one of the consecutive layouts - `STRS_CONSECUTIVE_64` or `STRS_CONSECUTIVE_32`. + * Produces one of the consecutive layouts - `STRS_CONSECUTIVE_64` or `STRS_CONSECUTIVE_32`. */ static Strs *Str_split_(PyObject *parent_string, sz_string_view_t const text, sz_string_view_t const separator, int keepseparator, Py_ssize_t maxsplit, sz_find_t finder, sz_size_t match_length) { @@ -2544,7 +2549,7 @@ static Strs *Str_split_(PyObject *parent_string, sz_string_view_t const text, sz /** * @brief Implements the reverse order split logic for both string-delimiters and character sets. - * Unlike the `Str_split_` can't use consecutive layouts and produces a `REAORDERED` one. + * Unlike the `Str_split_` can't use consecutive layouts and produces a `REORDERED` one. */ static Strs *Str_rsplit_(PyObject *parent_string, sz_string_view_t const text, sz_string_view_t const separator, int keepseparator, Py_ssize_t maxsplit, sz_find_t finder, sz_size_t match_length) { @@ -2622,7 +2627,7 @@ static Strs *Str_rsplit_(PyObject *parent_string, sz_string_view_t const text, s } /** - * @brief Proxy routing requests like `Str.split`, `Str.rsplit`, `Str.split_charset` and `Str.rsplit_charset` + * @brief Proxy routing requests like `Str.split`, `Str.rsplit`, `Str.split_byteset` and `Str.rsplit_byteset` * to `Str_split_` and `Str_rsplit_` implementations, parsing function arguments. 
*/ static PyObject *Str_split_with_known_callback(PyObject *self, PyObject *args, PyObject *kwargs, // @@ -2747,7 +2752,7 @@ static PyObject *Str_rsplit(PyObject *self, PyObject *args, PyObject *kwargs) { return Str_split_with_known_callback(self, args, kwargs, &sz_rfind, 0, sz_true_k, sz_false_k); } -static char const doc_split_charset[] = // +static char const doc_split_byteset[] = // "Split a string by a set of character separators.\n" "\n" "Args:\n" @@ -2758,11 +2763,11 @@ static char const doc_split_charset[] = // "Returns:\n" " Strs: A list of strings split by the character set."; -static PyObject *Str_split_charset(PyObject *self, PyObject *args, PyObject *kwargs) { - return Str_split_with_known_callback(self, args, kwargs, &sz_find_char_from, 1, sz_false_k, sz_false_k); +static PyObject *Str_split_byteset(PyObject *self, PyObject *args, PyObject *kwargs) { + return Str_split_with_known_callback(self, args, kwargs, &sz_find_byte_from, 1, sz_false_k, sz_false_k); } -static char const doc_rsplit_charset[] = // +static char const doc_rsplit_byteset[] = // "Split a string by a set of character separators in reverse order.\n" "\n" "Args:\n" @@ -2773,8 +2778,8 @@ static char const doc_rsplit_charset[] = // "Returns:\n" " Strs: A list of strings split by the character set."; -static PyObject *Str_rsplit_charset(PyObject *self, PyObject *args, PyObject *kwargs) { - return Str_split_with_known_callback(self, args, kwargs, &sz_rfind_char_from, 1, sz_true_k, sz_false_k); +static PyObject *Str_rsplit_byteset(PyObject *self, PyObject *args, PyObject *kwargs) { + return Str_split_with_known_callback(self, args, kwargs, &sz_rfind_byte_from, 1, sz_true_k, sz_false_k); } static char const doc_split_iter[] = // @@ -2809,7 +2814,7 @@ static PyObject *Str_rsplit_iter(PyObject *self, PyObject *args, PyObject *kwarg return Str_split_with_known_callback(self, args, kwargs, &sz_rfind, 0, sz_true_k, sz_true_k); } -static char const doc_split_charset_iter[] = // +static char const doc_split_byteset_iter[] = // "Create an iterator for splitting a string by a set of character separators.\n" "\n" "Args:\n" @@ -2819,11 +2824,11 @@ static char const doc_split_charset_iter[] = // "Returns:\n" " iterator: An iterator yielding split substrings."; -static PyObject *Str_split_charset_iter(PyObject *self, PyObject *args, PyObject *kwargs) { - return Str_split_with_known_callback(self, args, kwargs, &sz_find_char_from, 1, sz_false_k, sz_true_k); +static PyObject *Str_split_byteset_iter(PyObject *self, PyObject *args, PyObject *kwargs) { + return Str_split_with_known_callback(self, args, kwargs, &sz_find_byte_from, 1, sz_false_k, sz_true_k); } -static char const doc_rsplit_charset_iter[] = // +static char const doc_rsplit_byteset_iter[] = // "Create an iterator for splitting a string by a set of character separators in reverse order.\n" "\n" "Args:\n" @@ -2833,8 +2838,8 @@ static char const doc_rsplit_charset_iter[] = // "Returns:\n" " iterator: An iterator yielding split substrings in reverse."; -static PyObject *Str_rsplit_charset_iter(PyObject *self, PyObject *args, PyObject *kwargs) { - return Str_split_with_known_callback(self, args, kwargs, &sz_rfind_char_from, 1, sz_true_k, sz_true_k); +static PyObject *Str_rsplit_byteset_iter(PyObject *self, PyObject *args, PyObject *kwargs) { + return Str_split_with_known_callback(self, args, kwargs, &sz_rfind_byte_from, 1, sz_true_k, sz_true_k); } static char const doc_splitlines[] = // @@ -2924,7 +2929,7 @@ static PyObject *Str_splitlines(PyObject *self, PyObject *args, 
PyObject *kwargs sz_string_view_t separator; separator.start = "\x0A\x0B\x0C\x0D\x85\x1C\x1D\x1E"; separator.length = 8; - return Str_split_(text_obj, text, separator, keeplinebreaks, maxsplit, &sz_find_char_from, 1); + return Str_split_(text_obj, text, separator, keeplinebreaks, maxsplit, &sz_find_byte_from, 1); } static PyObject *Str_concat(PyObject *self, PyObject *other) { @@ -3011,23 +3016,24 @@ static PyMethodDef Str_methods[] = { {"hamming_distance", (PyCFunction)Str_hamming_distance, SZ_METHOD_FLAGS, doc_hamming_distance}, {"hamming_distance_unicode", (PyCFunction)Str_hamming_distance_unicode, SZ_METHOD_FLAGS, doc_hamming_distance_unicode}, - {"edit_distance", (PyCFunction)Str_edit_distance, SZ_METHOD_FLAGS, doc_edit_distance}, - {"edit_distance_unicode", (PyCFunction)Str_edit_distance_unicode, SZ_METHOD_FLAGS, doc_edit_distance_unicode}, - {"alignment_score", (PyCFunction)Str_alignment_score, SZ_METHOD_FLAGS, doc_alignment_score}, + {"levenshtein_distance", (PyCFunction)Str_levenshtein_distance, SZ_METHOD_FLAGS, doc_levenshtein_distance}, + {"levenshtein_distance_unicode", (PyCFunction)Str_levenshtein_distance_unicode, SZ_METHOD_FLAGS, + doc_levenshtein_distance_unicode}, + {"needleman_wunsch_score", (PyCFunction)Str_needleman_wunsch_score, SZ_METHOD_FLAGS, doc_needleman_wunsch_score}, // Character search extensions {"find_first_of", (PyCFunction)Str_find_first_of, SZ_METHOD_FLAGS, doc_find_first_of}, {"find_last_of", (PyCFunction)Str_find_last_of, SZ_METHOD_FLAGS, doc_find_last_of}, {"find_first_not_of", (PyCFunction)Str_find_first_not_of, SZ_METHOD_FLAGS, doc_find_first_not_of}, {"find_last_not_of", (PyCFunction)Str_find_last_not_of, SZ_METHOD_FLAGS, doc_find_last_not_of}, - {"split_charset", (PyCFunction)Str_split_charset, SZ_METHOD_FLAGS, doc_split_charset}, - {"rsplit_charset", (PyCFunction)Str_rsplit_charset, SZ_METHOD_FLAGS, doc_rsplit_charset}, + {"split_byteset", (PyCFunction)Str_split_byteset, SZ_METHOD_FLAGS, doc_split_byteset}, + {"rsplit_byteset", (PyCFunction)Str_rsplit_byteset, SZ_METHOD_FLAGS, doc_rsplit_byteset}, // Lazily evaluated iterators {"split_iter", (PyCFunction)Str_split_iter, SZ_METHOD_FLAGS, doc_split_iter}, {"rsplit_iter", (PyCFunction)Str_rsplit_iter, SZ_METHOD_FLAGS, doc_rsplit_iter}, - {"split_charset_iter", (PyCFunction)Str_split_charset_iter, SZ_METHOD_FLAGS, doc_split_charset_iter}, - {"rsplit_charset_iter", (PyCFunction)Str_rsplit_charset_iter, SZ_METHOD_FLAGS, doc_rsplit_charset_iter}, + {"split_byteset_iter", (PyCFunction)Str_split_byteset_iter, SZ_METHOD_FLAGS, doc_split_byteset_iter}, + {"rsplit_byteset_iter", (PyCFunction)Str_rsplit_byteset_iter, SZ_METHOD_FLAGS, doc_rsplit_byteset_iter}, // Dealing with larger-than-memory datasets {"offset_within", (PyCFunction)Str_offset_within, SZ_METHOD_FLAGS, doc_offset_within}, @@ -3181,8 +3187,8 @@ static PyObject *Strs_shuffle(Strs *self, PyObject *args, PyObject *kwargs) { Py_RETURN_NONE; } -static sz_bool_t Strs_sort_(Strs *self, sz_string_view_t **parts_output, sz_sorted_idx_t **order_output, - sz_size_t *count_output) { +static sz_bool_t Strs_argsort_(Strs *self, sz_string_view_t **parts_output, sz_sorted_idx_t **order_output, + sz_size_t *count_output) { // Change the layout if (!prepare_strings_for_reordering(self)) { PyErr_Format(PyExc_TypeError, "Failed to prepare the sequence for sorting"); @@ -3208,17 +3214,15 @@ static sz_bool_t Strs_sort_(Strs *self, sz_string_view_t **parts_output, sz_sort // Call our sorting algorithm sz_sequence_t sequence; sz_fill(&sequence, sizeof(sequence), 0); - 
sequence.order = (sz_sorted_idx_t *)temporary_memory.start; sequence.count = count; sequence.handle = parts; sequence.get_start = parts_get_start; sequence.get_length = parts_get_length; - for (sz_sorted_idx_t i = 0; i != sequence.count; ++i) sequence.order[i] = i; - sz_sequence_argsort(&sequence); + sz_status_t status = sz_sequence_argsort(&sequence, NULL, (sz_sorted_idx_t *)temporary_memory.start); // Export results *parts_output = parts; - *order_output = sequence.order; + *order_output = (sz_sorted_idx_t *)temporary_memory.start; *count_output = sequence.count; return 1; } @@ -3256,18 +3260,18 @@ static PyObject *Strs_sort(Strs *self, PyObject *args, PyObject *kwargs) { sz_string_view_t *parts = NULL; sz_size_t *order = NULL; sz_size_t count = 0; - if (!Strs_sort_(self, &parts, &order, &count)) return NULL; + if (!Strs_argsort_(self, &parts, &order, &count)) return NULL; // Apply the sorting algorithm here, considering the `reverse` value if (reverse) reverse_offsets(order, count); // Apply the new order. - apply_order(parts, order, count); + permute(parts, order, count); Py_RETURN_NONE; } -static PyObject *Strs_order(Strs *self, PyObject *args, PyObject *kwargs) { +static PyObject *Strs_argsort(Strs *self, PyObject *args, PyObject *kwargs) { PyObject *reverse_obj = NULL; // Default is not reversed // Check for positional arguments @@ -3300,7 +3304,7 @@ static PyObject *Strs_order(Strs *self, PyObject *args, PyObject *kwargs) { sz_string_view_t *parts = NULL; sz_sorted_idx_t *order = NULL; sz_size_t count = 0; - if (!Strs_sort_(self, &parts, &order, &count)) return NULL; + if (!Strs_argsort_(self, &parts, &order, &count)) return NULL; // Apply the sorting algorithm here, considering the `reverse` value if (reverse) reverse_offsets(order, count); @@ -3606,11 +3610,11 @@ static PyGetSetDef Strs_getsetters[] = { static PyMethodDef Strs_methods[] = { {"shuffle", Strs_shuffle, SZ_METHOD_FLAGS, "Shuffle (in-place) the elements of the Strs object."}, // {"sort", Strs_sort, SZ_METHOD_FLAGS, "Sort (in-place) the elements of the Strs object."}, // - {"order", Strs_order, SZ_METHOD_FLAGS, "Provides the indexes to achieve sorted order."}, // + {"argsort", Strs_argsort, SZ_METHOD_FLAGS, "Provides the permutation to achieve sorted order."}, // {"sample", Strs_sample, SZ_METHOD_FLAGS, "Provides a random sample of a given size."}, // - // {"to_pylist", Strs_to_pylist, SZ_METHOD_FLAGS, "Exports string-views to a native list of native strings."}, - // // - {NULL, NULL, 0, NULL}}; + // {"to_pylist", Strs_to_pylist, SZ_METHOD_FLAGS, "Exports string-views to a native list of native strings."}, // + {NULL, NULL, 0, NULL} // Sentinel +}; static PyTypeObject StrsType = { PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stringzilla.Strs", @@ -3660,23 +3664,24 @@ static PyMethodDef stringzilla_methods[] = { // Edit distance extensions {"hamming_distance", Str_hamming_distance, SZ_METHOD_FLAGS, doc_hamming_distance}, {"hamming_distance_unicode", Str_hamming_distance_unicode, SZ_METHOD_FLAGS, doc_hamming_distance_unicode}, - {"edit_distance", Str_edit_distance, SZ_METHOD_FLAGS, doc_edit_distance}, - {"edit_distance_unicode", Str_edit_distance_unicode, SZ_METHOD_FLAGS, doc_edit_distance_unicode}, - {"alignment_score", Str_alignment_score, SZ_METHOD_FLAGS, doc_alignment_score}, + {"levenshtein_distance", Str_levenshtein_distance, SZ_METHOD_FLAGS, doc_levenshtein_distance}, + {"levenshtein_distance_unicode", Str_levenshtein_distance_unicode, SZ_METHOD_FLAGS, + doc_levenshtein_distance_unicode}, + {"needleman_wunsch_score", 
Str_needleman_wunsch_score, SZ_METHOD_FLAGS, doc_needleman_wunsch_score}, // Character search extensions {"find_first_of", Str_find_first_of, SZ_METHOD_FLAGS, doc_find_first_of}, {"find_last_of", Str_find_last_of, SZ_METHOD_FLAGS, doc_find_last_of}, {"find_first_not_of", Str_find_first_not_of, SZ_METHOD_FLAGS, doc_find_first_not_of}, {"find_last_not_of", Str_find_last_not_of, SZ_METHOD_FLAGS, doc_find_last_not_of}, - {"split_charset", Str_split_charset, SZ_METHOD_FLAGS, doc_split_charset}, - {"rsplit_charset", Str_rsplit_charset, SZ_METHOD_FLAGS, doc_rsplit_charset}, + {"split_byteset", Str_split_byteset, SZ_METHOD_FLAGS, doc_split_byteset}, + {"rsplit_byteset", Str_rsplit_byteset, SZ_METHOD_FLAGS, doc_rsplit_byteset}, // Lazily evaluated iterators {"split_iter", Str_split_iter, SZ_METHOD_FLAGS, doc_split_iter}, {"rsplit_iter", Str_rsplit_iter, SZ_METHOD_FLAGS, doc_rsplit_iter}, - {"split_charset_iter", Str_split_charset_iter, SZ_METHOD_FLAGS, doc_split_charset_iter}, - {"rsplit_charset_iter", Str_rsplit_charset_iter, SZ_METHOD_FLAGS, doc_rsplit_charset_iter}, + {"split_byteset_iter", Str_split_byteset_iter, SZ_METHOD_FLAGS, doc_split_byteset_iter}, + {"rsplit_byteset_iter", Str_rsplit_byteset_iter, SZ_METHOD_FLAGS, doc_rsplit_byteset_iter}, // Dealing with larger-than-memory datasets {"offset_within", Str_offset_within, SZ_METHOD_FLAGS, doc_offset_within}, @@ -3714,8 +3719,7 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { // Add version metadata { char version_str[50]; - sprintf(version_str, "%d.%d.%d", STRINGZILLA_VERSION_MAJOR, STRINGZILLA_VERSION_MINOR, - STRINGZILLA_VERSION_PATCH); + sprintf(version_str, "%d.%d.%d", sz_version_major(), sz_version_minor(), sz_version_patch()); PyModule_AddStringConstant(m, "__version__", version_str); } @@ -3724,17 +3728,18 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { sz_capability_t caps = sz_capabilities(); char caps_str[512]; char const *serial = (caps & sz_cap_serial_k) ? "serial," : ""; - char const *neon = (caps & sz_cap_arm_neon_k) ? "neon," : ""; - char const *sve = (caps & sz_cap_arm_sve_k) ? "sve," : ""; - char const *avx2 = (caps & sz_cap_x86_avx2_k) ? "avx2," : ""; - char const *avx512f = (caps & sz_cap_x86_avx512f_k) ? "avx512f," : ""; - char const *avx512vl = (caps & sz_cap_x86_avx512vl_k) ? "avx512vl," : ""; - char const *avx512bw = (caps & sz_cap_x86_avx512bw_k) ? "avx512bw," : ""; - char const *avx512vbmi = (caps & sz_cap_x86_avx512vbmi_k) ? "avx512vbmi," : ""; - char const *gfni = (caps & sz_cap_x86_gfni_k) ? "gfni," : ""; - char const *avx512vbmi2 = (caps & sz_cap_x86_avx512vbmi2_k) ? "avx512vbmi2," : ""; - sprintf(caps_str, "%s%s%s%s%s%s%s%s%s%s", serial, neon, sve, avx2, avx512f, avx512vl, avx512bw, avx512vbmi, - avx512vbmi2, gfni); + char const *neon = (caps & sz_cap_neon_k) ? "neon," : ""; + char const *neon_aes = (caps & sz_cap_neon_aes_k) ? "neon_aes," : ""; + char const *sve = (caps & sz_cap_sve_k) ? "sve," : ""; + char const *sve2 = (caps & sz_cap_sve2_k) ? "sve2," : ""; + char const *sve2_aes = (caps & sz_cap_sve2_aes_k) ? "sve2_aes," : ""; + char const *haswell = (caps & sz_cap_haswell_k) ? "haswell," : ""; + char const *skylake = (caps & sz_cap_skylake_k) ? "skylake," : ""; + char const *ice = (caps & sz_cap_ice_k) ? 
"ice," : ""; + sprintf(caps_str, "%s%s%s%s%s%s%s%s%s", // + serial, // + neon, neon_aes, sve, sve2, sve2_aes, // + haswell, skylake, ice); PyModule_AddStringConstant(m, "__capabilities__", caps_str); } diff --git a/rust/lib.rs b/rust/lib.rs index d9d4e237..d5e9a682 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -8,6 +8,61 @@ pub mod sz { + #[repr(C)] + #[derive(Debug, PartialEq)] + pub enum Status { + Success = 0, + BadAlloc = -1, + InvalidUtf8 = -2, + ContainsDuplicates = -3, + } + + #[repr(C)] + #[derive(Debug, Clone, Copy)] + pub struct Byteset { + bits: [u64; 4], + } + + impl Byteset { + /// Initializes a bit‑set to an empty collection (all characters banned). + #[inline] + pub fn new() -> Self { + Self { bits: [0; 4] } + } + + /// Initializes a bit‑set to contain all ASCII characters. + #[inline] + pub fn new_ascii() -> Self { + Self { + bits: [u64::MAX, u64::MAX, 0, 0], + } + } + + /// Adds a byte to the set. + #[inline] + pub fn add_u8(&mut self, c: u8) { + let idx = (c >> 6) as usize; // Divide by 64. + let bit = c & 63; // Remainder modulo 64. + self.bits[idx] |= 1 << bit; + } + + /// Adds a character to the set. + /// + /// This function assumes the character is in the ASCII range. + #[inline] + pub fn add(&mut self, c: char) { + self.add_u8(c as u8); + } + + /// Inverts the bit-set so that all set bits become unset and vice versa. + #[inline] + pub fn invert(&mut self) { + for b in self.bits.iter_mut() { + *b = !*b; + } + } + } + use core::{ffi::c_void, usize}; // Import the functions from the StringZilla C library. @@ -26,83 +81,64 @@ pub mod sz { needle_length: usize, ) -> *const c_void; - fn sz_find_char_from( - haystack: *const c_void, - haystack_length: usize, - needle: *const c_void, - needle_length: usize, - ) -> *const c_void; + fn sz_find_byteset(haystack: *const c_void, haystack_length: usize, byteset: *const c_void) -> *const c_void; - fn sz_rfind_char_from( - haystack: *const c_void, - haystack_length: usize, - needle: *const c_void, - needle_length: usize, - ) -> *const c_void; - - fn sz_find_char_not_from( - haystack: *const c_void, - haystack_length: usize, - needle: *const c_void, - needle_length: usize, - ) -> *const c_void; - - fn sz_rfind_char_not_from( - haystack: *const c_void, - haystack_length: usize, - needle: *const c_void, - needle_length: usize, - ) -> *const c_void; + fn sz_rfind_byteset(haystack: *const c_void, haystack_length: usize, byteset: *const c_void) -> *const c_void; fn sz_bytesum(text: *const c_void, length: usize) -> u64; fn sz_hash(text: *const c_void, length: usize, seed: u64) -> u64; - fn sz_generate(text: *mut c_void, length: usize, seed: u64) -> u64; + fn sz_fill_random(text: *mut c_void, length: usize, seed: u64); - fn sz_edit_distance( - haystack1: *const c_void, - haystack1_length: usize, - haystack2: *const c_void, - haystack2_length: usize, + pub fn sz_levenshtein_distance( + a: *const c_void, + a_length: usize, + b: *const c_void, + b_length: usize, bound: usize, - allocator: *const c_void, - ) -> usize; - - fn sz_edit_distance_utf8( - haystack1: *const c_void, - haystack1_length: usize, - haystack2: *const c_void, - haystack2_length: usize, + alloc: *const c_void, + result: *mut usize, + ) -> Status; + + pub fn sz_levenshtein_distance_utf8( + a: *const c_void, + a_length: usize, + b: *const c_void, + b_length: usize, bound: usize, - allocator: *const c_void, - ) -> usize; - - fn sz_hamming_distance( - haystack1: *const c_void, - haystack1_length: usize, - haystack2: *const c_void, - haystack2_length: usize, + alloc: *const 
c_void, + result: *mut usize, + ) -> Status; + + pub fn sz_hamming_distance( + a: *const c_void, + a_length: usize, + b: *const c_void, + b_length: usize, bound: usize, - ) -> usize; - - fn sz_hamming_distance_utf8( - haystack1: *const c_void, - haystack1_length: usize, - haystack2: *const c_void, - haystack2_length: usize, + result: *mut usize, + ) -> Status; + + pub fn sz_hamming_distance_utf8( + a: *const c_void, + a_length: usize, + b: *const c_void, + b_length: usize, bound: usize, - ) -> usize; - - fn sz_alignment_score( - haystack1: *const c_void, - haystack1_length: usize, - haystack2: *const c_void, - haystack2_length: usize, - matrix: *const c_void, + result: *mut usize, + ) -> Status; + + pub fn sz_needleman_wunsch_score( + a: *const c_void, + a_length: usize, + b: *const c_void, + b_length: usize, + subs: *const i8, gap: i8, - allocator: *const c_void, - ) -> isize; + alloc: *const c_void, + result: *mut isize, + ) -> Status; } @@ -136,21 +172,41 @@ pub mod sz { /// # Arguments /// /// * `text`: The byte slice to compute the checksum for. + /// * `seed` - A 64-bit value that acts as the seed for the hash function. /// /// # Returns /// /// A `u64` representing the hash value of the input byte slice. - pub fn hash(text: T) -> u64 + pub fn hash_with_seed(text: T, seed: u64) -> u64 where T: AsRef<[u8]>, { let text_ref = text.as_ref(); let text_pointer = text_ref.as_ptr() as _; let text_length = text_ref.len(); - let result = unsafe { sz_hash(text_pointer, text_length) }; + let result = unsafe { sz_hash(text_pointer, text_length, seed) }; return result; } + /// Computes a 64-bit AES-based hash value for a given byte slice `text`. + /// This function is designed to provide a high-quality hash value for use in + /// hash tables, data structures, and cryptographic applications. + /// Unlike the bytesum function, the hash function is order-sensitive. + /// + /// # Arguments + /// + /// * `text`: The byte slice to compute the checksum for. + /// + /// # Returns + /// + /// A `u64` representing the hash value of the input byte slice. + pub fn hash(text: T) -> u64 + where + T: AsRef<[u8]>, + { + hash_with_seed(text, 0) + } + /// Locates the first matching substring within `haystack` that equals `needle`. /// This function is similar to the `memmem()` function in LibC, but, unlike `strstr()`, /// it requires the length of both haystack and needle to be known beforehand. @@ -175,14 +231,7 @@ pub mod sz { let haystack_length = haystack_ref.len(); let needle_pointer = needle_ref.as_ptr() as _; let needle_length = needle_ref.len(); - let result = unsafe { - sz_find( - haystack_pointer, - haystack_length, - needle_pointer, - needle_length, - ) - }; + let result = unsafe { sz_find(haystack_pointer, haystack_length, needle_pointer, needle_length) }; if result.is_null() { None @@ -215,14 +264,7 @@ pub mod sz { let haystack_length = haystack_ref.len(); let needle_pointer = needle_ref.as_ptr() as _; let needle_length = needle_ref.len(); - let result = unsafe { - sz_rfind( - haystack_pointer, - haystack_length, - needle_pointer, - needle_length, - ) - }; + let result = unsafe { sz_rfind(haystack_pointer, haystack_length, needle_pointer, needle_length) }; if result.is_null() { None @@ -244,7 +286,7 @@ pub mod sz { /// /// An `Option` representing the index of the first occurrence of any byte from /// `needles` within `haystack`, if found, otherwise `None`. 
- pub fn find_char_from(haystack: H, needles: N) -> Option + pub fn find_byte_from(haystack: H, needles: N) -> Option where H: AsRef<[u8]>, N: AsRef<[u8]>, @@ -253,16 +295,13 @@ pub mod sz { let needles_ref = needles.as_ref(); let haystack_pointer = haystack_ref.as_ptr() as _; let haystack_length = haystack_ref.len(); - let needles_pointer = needles_ref.as_ptr() as _; - let needles_length = needles_ref.len(); - let result = unsafe { - sz_find_char_from( - haystack_pointer, - haystack_length, - needles_pointer, - needles_length, - ) - }; + let mut byteset = Byteset::new(); + for &b in needles_ref { + byteset.add_u8(b); + } + + let result = + unsafe { sz_find_byteset(haystack_pointer, haystack_length, &byteset as *const _ as *const c_void) }; if result.is_null() { None } else { @@ -283,7 +322,7 @@ pub mod sz { /// /// An `Option` representing the index of the last occurrence of any byte from /// `needles` within `haystack`, if found, otherwise `None`. - pub fn rfind_char_from(haystack: H, needles: N) -> Option + pub fn rfind_byte_from(haystack: H, needles: N) -> Option where H: AsRef<[u8]>, N: AsRef<[u8]>, @@ -292,16 +331,13 @@ pub mod sz { let needles_ref = needles.as_ref(); let haystack_pointer = haystack_ref.as_ptr() as _; let haystack_length = haystack_ref.len(); - let needles_pointer = needles_ref.as_ptr() as _; - let needles_length = needles_ref.len(); - let result = unsafe { - sz_rfind_char_from( - haystack_pointer, - haystack_length, - needles_pointer, - needles_length, - ) - }; + let mut byteset = Byteset::new(); + for &b in needles_ref { + byteset.add_u8(b); + } + + let result = + unsafe { sz_rfind_byteset(haystack_pointer, haystack_length, &byteset as *const _ as *const c_void) }; if result.is_null() { None } else { @@ -322,7 +358,7 @@ pub mod sz { /// /// An `Option` representing the index of the first occurrence of any byte not in /// `needles` within `haystack`, if found, otherwise `None`. - pub fn find_char_not_from(haystack: H, needles: N) -> Option + pub fn find_byte_not_from(haystack: H, needles: N) -> Option where H: AsRef<[u8]>, N: AsRef<[u8]>, @@ -331,16 +367,14 @@ pub mod sz { let needles_ref = needles.as_ref(); let haystack_pointer = haystack_ref.as_ptr() as _; let haystack_length = haystack_ref.len(); - let needles_pointer = needles_ref.as_ptr() as _; - let needles_length = needles_ref.len(); - let result = unsafe { - sz_find_char_not_from( - haystack_pointer, - haystack_length, - needles_pointer, - needles_length, - ) - }; + let mut byteset = Byteset::new(); + for &b in needles_ref { + byteset.add_u8(b); + } + byteset.invert(); + + let result = + unsafe { sz_find_byteset(haystack_pointer, haystack_length, &byteset as *const _ as *const c_void) }; if result.is_null() { None } else { @@ -361,7 +395,7 @@ pub mod sz { /// /// An `Option` representing the index of the last occurrence of any byte not in /// `needles` within `haystack`, if found, otherwise `None`. 
- pub fn rfind_char_not_from(haystack: H, needles: N) -> Option + pub fn rfind_byte_not_from(haystack: H, needles: N) -> Option where H: AsRef<[u8]>, N: AsRef<[u8]>, @@ -370,16 +404,14 @@ pub mod sz { let needles_ref = needles.as_ref(); let haystack_pointer = haystack_ref.as_ptr() as _; let haystack_length = haystack_ref.len(); - let needles_pointer = needles_ref.as_ptr() as _; - let needles_length = needles_ref.len(); - let result = unsafe { - sz_rfind_char_not_from( - haystack_pointer, - haystack_length, - needles_pointer, - needles_length, - ) - }; + let mut byteset = Byteset::new(); + for &b in needles_ref { + byteset.add_u8(b); + } + byteset.invert(); + + let result = + unsafe { sz_rfind_byteset(haystack_pointer, haystack_length, &byteset as *const _ as *const c_void) }; if result.is_null() { None } else { @@ -401,7 +433,7 @@ pub mod sz { /// /// A `usize` representing the minimum number of single-character edits (insertions, /// deletions, or substitutions) required to change `first` into `second`. - pub fn edit_distance_bounded(first: F, second: S, bound: usize) -> usize + pub fn levenshtein_distance_bounded(first: F, second: S, bound: usize) -> Result where F: AsRef<[u8]>, S: AsRef<[u8]>, @@ -412,19 +444,22 @@ pub mod sz { let second_length = second_ref.len(); let first_pointer = first_ref.as_ptr() as _; let second_pointer = second_ref.as_ptr() as _; - unsafe { - sz_edit_distance( + let mut result: usize = 0; + let status = unsafe { + sz_levenshtein_distance( first_pointer, first_length, second_pointer, second_length, - // Upper bound on the distance, that allows us to exit early. If zero is - // passed, the maximum possible distance will be equal to the length of - // the longer input. bound, - // Uses the default allocator - core::ptr::null(), + core::ptr::null(), // Uses the default allocator + &mut result as *mut _, ) + }; + if status == Status::Success { + Ok(result) + } else { + Err(status) } } @@ -441,7 +476,7 @@ pub mod sz { /// /// A `usize` representing the minimum number of single-character edits (insertions, /// deletions, or substitutions) required to change `first` into `second`. - pub fn edit_distance_utf8_bounded(first: F, second: S, bound: usize) -> usize + pub fn levenshtein_distance_utf8_bounded(first: F, second: S, bound: usize) -> Result where F: AsRef<[u8]>, S: AsRef<[u8]>, @@ -452,19 +487,22 @@ pub mod sz { let second_length = second_ref.len(); let first_pointer = first_ref.as_ptr() as _; let second_pointer = second_ref.as_ptr() as _; - unsafe { - sz_edit_distance_utf8( + let mut result: usize = 0; + let status = unsafe { + sz_levenshtein_distance_utf8( first_pointer, first_length, second_pointer, second_length, - // Upper bound on the distance, that allows us to exit early. If zero is - // passed, the maximum possible distance will be equal to the length of - // the longer input. bound, - // Uses the default allocator - core::ptr::null(), + core::ptr::null(), // Uses the default allocator + &mut result as *mut _, ) + }; + if status == Status::Success { + Ok(result) + } else { + Err(status) } } @@ -481,12 +519,12 @@ pub mod sz { /// /// A `usize` representing the minimum number of single-character edits (insertions, /// deletions, or substitutions) required to change `first` into `second`. 
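
Both the Python and Rust bindings now surface the same C convention: the result travels through an out-parameter while the return value is a status code. A minimal C sketch of that convention, where a `NULL` allocator selects the default and a zero bound means "unbounded", per the comments above:

    #include <stringzilla/stringzilla.h> // assumed header path

    int main(void) {
        sz_size_t distance;
        sz_status_t status = sz_levenshtein_distance("kitten", 6, "sitting", 7,
                                                     /*bound=*/0,    // zero disables early exit, per the comments above
                                                     /*alloc=*/NULL, // the default allocator
                                                     &distance);
        return status == sz_success_k && distance == 3 ? 0 : 1;
    }
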
- pub fn edit_distance(first: F, second: S) -> usize + pub fn levenshtein_distance(first: F, second: S) -> Result where F: AsRef<[u8]>, S: AsRef<[u8]>, { - edit_distance_bounded(first, second, usize::MAX) + levenshtein_distance_bounded(first, second, usize::MAX) } /// Computes the Levenshtein edit distance between two UTF8 strings, using the Wagner-Fisher @@ -501,12 +539,12 @@ pub mod sz { /// /// A `usize` representing the minimum number of single-character edits (insertions, /// deletions, or substitutions) required to change `first` into `second`. - pub fn edit_distance_utf8(first: F, second: S) -> usize + pub fn levenshtein_distance_utf8(first: F, second: S) -> Result where F: AsRef<[u8]>, S: AsRef<[u8]>, { - edit_distance_utf8_bounded(first, second, usize::MAX) + levenshtein_distance_utf8_bounded(first, second, usize::MAX) } /// Computes the Hamming edit distance between two strings, counting the number of substituted characters. @@ -522,7 +560,7 @@ pub mod sz { /// /// A `usize` representing the minimum number of single-character edits (substitutions) required to /// change `first` into `second`. - pub fn hamming_distance_bounded(first: F, second: S, bound: usize) -> usize + pub fn hamming_distance_bounded(first: F, second: S, bound: usize) -> Result where F: AsRef<[u8]>, S: AsRef<[u8]>, @@ -533,17 +571,21 @@ pub mod sz { let second_length = second_ref.len(); let first_pointer = first_ref.as_ptr() as _; let second_pointer = second_ref.as_ptr() as _; - unsafe { + let mut result: usize = 0; + let status = unsafe { sz_hamming_distance( first_pointer, first_length, second_pointer, second_length, - // Upper bound on the distance, that allows us to exit early. If zero is - // passed, the maximum possible distance will be equal to the length of - // the longer input. bound, + &mut result as *mut _, ) + }; + if status == Status::Success { + Ok(result) + } else { + Err(status) } } @@ -560,7 +602,7 @@ pub mod sz { /// /// A `usize` representing the minimum number of single-character edits (substitutions) required to /// change `first` into `second`. - pub fn hamming_distance_utf8_bounded(first: F, second: S, bound: usize) -> usize + pub fn hamming_distance_utf8_bounded(first: F, second: S, bound: usize) -> Result where F: AsRef<[u8]>, S: AsRef<[u8]>, @@ -571,17 +613,21 @@ pub mod sz { let second_length = second_ref.len(); let first_pointer = first_ref.as_ptr() as _; let second_pointer = second_ref.as_ptr() as _; - unsafe { + let mut result: usize = 0; + let status = unsafe { sz_hamming_distance_utf8( first_pointer, first_length, second_pointer, second_length, - // Upper bound on the distance, that allows us to exit early. If zero is - // passed, the maximum possible distance will be equal to the length of - // the longer input. bound, + &mut result as *mut _, ) + }; + if status == Status::Success { + Ok(result) + } else { + Err(status) } } @@ -597,7 +643,7 @@ pub mod sz { /// /// A `usize` representing the minimum number of single-character edits (substitutions) required to /// change `first` into `second`. - pub fn hamming_distance(first: F, second: S) -> usize + pub fn hamming_distance(first: F, second: S) -> Result where F: AsRef<[u8]>, S: AsRef<[u8]>, @@ -617,7 +663,7 @@ pub mod sz { /// /// A `usize` representing the minimum number of single-character edits (substitutions) required to /// change `first` into `second`. 
- pub fn hamming_distance_utf8(first: F, second: S) -> usize + pub fn hamming_distance_utf8(first: F, second: S) -> Result where F: AsRef<[u8]>, S: AsRef<[u8]>, @@ -642,7 +688,7 @@ pub mod sz { /// An `isize` representing the total alignment score, where higher scores indicate better /// alignment between the two strings, considering the specified gap penalties and /// substitution matrix. - pub fn alignment_score(first: F, second: S, matrix: [[i8; 256]; 256], gap: i8) -> isize + pub fn alignment_score(first: F, second: S, matrix: [[i8; 256]; 256], gap: i8) -> Result where F: AsRef<[u8]>, S: AsRef<[u8]>, @@ -653,16 +699,23 @@ pub mod sz { let second_length = second_ref.len(); let first_pointer = first_ref.as_ptr() as _; let second_pointer = second_ref.as_ptr() as _; - unsafe { - sz_alignment_score( + let mut result: isize = 0; + let status = unsafe { + sz_needleman_wunsch_score( first_pointer, first_length, second_pointer, second_length, matrix.as_ptr() as _, gap, - core::ptr::null(), + core::ptr::null(), // Uses the default allocator + &mut result as *mut _, ) + }; + if status == Status::Success { + Ok(result) + } else { + Err(status) } } @@ -697,42 +750,27 @@ pub mod sz { /// you need to generate random strings or data sequences based on a specific set /// of characters, such as generating random DNA sequences or testing inputs. /// - /// # Type Parameters - /// - /// * `T`: The type of the text to be randomized. Must be mutable and convertible to a byte slice. - /// * `A`: The type of the alphabet. Must be convertible to a byte slice. - /// /// # Arguments /// - /// * `text`: A mutable reference to the data to randomize. This data will be mutated in place. - /// * `alphabet`: A reference to the byte slice representing the alphabet to use for randomization. + /// * `buffer`: A mutable reference to the data to randomize. This data will be mutated in place. + /// * `nonce`: A 64-bit "number used once" (nonce) value to seed the random number generator. /// /// # Examples /// /// ``` /// use stringzilla::sz; - /// let mut my_text = vec![0; 10]; // A buffer to randomize - /// let alphabet = b"ACTG"; // Using a DNA alphabet - /// sz::randomize(&mut my_text, &alphabet); + /// let mut buffer = vec![0; 10]; + /// sz::fill_random(&mut buffer, 42); /// ``` /// - /// After than, `my_text` is filled with random 'A', 'C', 'T', or 'G' values. - pub fn randomize(text: &mut T, alphabet: &A) + /// After than, `buffer` is filled with random byte values from 0 to 255. + pub fn fill_random(buffer: &mut T, nonce: u64) where T: AsMut<[u8]> + ?Sized, // Allows for mutable references to dynamically sized types. - A: AsRef<[u8]> + ?Sized, // Allows for references to dynamically sized types. 
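The alphabet-driven `randomize` is replaced here by the nonce-seeded `fill_random`. For callers that still need alphabet-constrained output, an adapter can be layered on top; the `fill_random_from_alphabet` helper below is purely illustrative and assumes `fill_random` produces uniform bytes:

```rust
use stringzilla::sz;

// Hypothetical adapter: fill `buffer` with random symbols drawn from `alphabet`,
// built on top of the new nonce-seeded `fill_random`. The remapping is slightly
// biased unless the alphabet length divides 256, which is usually fine for tests.
fn fill_random_from_alphabet(buffer: &mut [u8], alphabet: &[u8], nonce: u64) {
    sz::fill_random(&mut *buffer, nonce);
    for byte in buffer.iter_mut() {
        *byte = alphabet[*byte as usize % alphabet.len()];
    }
}

fn main() {
    let mut dna = vec![0u8; 16];
    fill_random_from_alphabet(&mut dna, b"ACTG", 42);
    assert!(dna.iter().all(|b| b"ACTG".contains(b)));
}
```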
{ - let text_slice = text.as_mut(); - let alphabet_slice = alphabet.as_ref(); + let buffer_slice = buffer.as_mut(); unsafe { - sz_generate( - alphabet_slice.as_ptr() as *const c_void, - alphabet_slice.len(), - text_slice.as_mut_ptr() as *mut c_void, - text_slice.len(), - core::ptr::null(), - core::ptr::null_mut(), - ); + sz_fill_random(buffer_slice.as_ptr() as _, buffer_slice.len(), nonce); } } } @@ -757,10 +795,10 @@ impl<'a> Matcher<'a> for MatcherType<'a> { match self { MatcherType::Find(needle) => sz::find(haystack, needle), MatcherType::RFind(needle) => sz::rfind(haystack, needle), - MatcherType::FindFirstOf(needles) => sz::find_char_from(haystack, needles), - MatcherType::FindLastOf(needles) => sz::rfind_char_from(haystack, needles), - MatcherType::FindFirstNotOf(needles) => sz::find_char_not_from(haystack, needles), - MatcherType::FindLastNotOf(needles) => sz::rfind_char_not_from(haystack, needles), + MatcherType::FindFirstOf(needles) => sz::find_byte_from(haystack, needles), + MatcherType::FindLastOf(needles) => sz::rfind_byte_from(haystack, needles), + MatcherType::FindFirstNotOf(needles) => sz::find_byte_not_from(haystack, needles), + MatcherType::FindLastNotOf(needles) => sz::rfind_byte_not_from(haystack, needles), } } @@ -1088,9 +1126,9 @@ where /// use stringzilla::StringZilla; /// /// let haystack = "Hello, world!"; - /// assert_eq!(haystack.sz_find_char_from("aeiou".as_bytes()), Some(1)); + /// assert_eq!(haystack.sz_find_byte_from("aeiou".as_bytes()), Some(1)); /// ``` - fn sz_find_char_from(&self, needles: N) -> Option; + fn sz_find_byte_from(&self, needles: N) -> Option; /// Finds the index of the last character in `self` that is also present in `needles`. /// @@ -1100,9 +1138,9 @@ where /// use stringzilla::StringZilla; /// /// let haystack = "Hello, world!"; - /// assert_eq!(haystack.sz_rfind_char_from("aeiou".as_bytes()), Some(8)); + /// assert_eq!(haystack.sz_rfind_byte_from("aeiou".as_bytes()), Some(8)); /// ``` - fn sz_rfind_char_from(&self, needles: N) -> Option; + fn sz_rfind_byte_from(&self, needles: N) -> Option; /// Finds the index of the first character in `self` that is not present in `needles`. /// @@ -1112,9 +1150,9 @@ where /// use stringzilla::StringZilla; /// /// let haystack = "Hello, world!"; - /// assert_eq!(haystack.sz_find_char_not_from("aeiou".as_bytes()), Some(0)); + /// assert_eq!(haystack.sz_find_byte_not_from("aeiou".as_bytes()), Some(0)); /// ``` - fn sz_find_char_not_from(&self, needles: N) -> Option; + fn sz_find_byte_not_from(&self, needles: N) -> Option; /// Finds the index of the last character in `self` that is not present in `needles`. /// @@ -1124,9 +1162,9 @@ where /// use stringzilla::StringZilla; /// /// let haystack = "Hello, world!"; - /// assert_eq!(haystack.sz_rfind_char_not_from("aeiou".as_bytes()), Some(12)); + /// assert_eq!(haystack.sz_rfind_byte_not_from("aeiou".as_bytes()), Some(12)); /// ``` - fn sz_rfind_char_not_from(&self, needles: N) -> Option; + fn sz_rfind_byte_not_from(&self, needles: N) -> Option; /// Computes the Levenshtein edit distance between `self` and `other`. /// @@ -1137,9 +1175,9 @@ where /// /// let first = "kitten"; /// let second = "sitting"; - /// assert_eq!(first.sz_edit_distance(second.as_bytes()), 3); + /// assert_eq!(first.sz_levenshtein_distance(second.as_bytes()), Ok(3)); /// ``` - fn sz_edit_distance(&self, other: N) -> usize; + fn sz_levenshtein_distance(&self, other: N) -> Result; /// Computes the Levenshtein edit distance between `self` and `other`. 
/// @@ -1150,9 +1188,9 @@ where /// /// let first = "kitten"; /// let second = "sitting"; - /// assert_eq!(first.sz_edit_distance_utf8(second.as_bytes()), 3); + /// assert_eq!(first.sz_levenshtein_distance_utf8(second.as_bytes()), Ok(3)); /// ``` - fn sz_edit_distance_utf8(&self, other: N) -> usize; + fn sz_levenshtein_distance_utf8(&self, other: N) -> Result; /// Computes the bounded Levenshtein edit distance between `self` and `other`. /// @@ -1163,9 +1201,9 @@ where /// /// let first = "kitten"; /// let second = "sitting"; - /// assert_eq!(first.sz_edit_distance_bounded(second.as_bytes()), 3); + /// assert_eq!(first.sz_levenshtein_distance_bounded(second.as_bytes()), Ok(3)); /// ``` - fn sz_edit_distance_bounded(&self, other: N, bound: usize) -> usize; + fn sz_levenshtein_distance_bounded(&self, other: N, bound: usize) -> Result; /// Computes the bounded Levenshtein edit distance between `self` and `other`. /// @@ -1176,9 +1214,9 @@ where /// /// let first = "kitten"; /// let second = "sitting"; - /// assert_eq!(first.sz_edit_distance_utf8_bounded(second.as_bytes()), 3); + /// assert_eq!(first.sz_levenshtein_distance_utf8_bounded(second.as_bytes()), Ok(3)); /// ``` - fn sz_edit_distance_utf8_bounded(&self, other: N, bound: usize) -> usize; + fn sz_levenshtein_distance_utf8_bounded(&self, other: N, bound: usize) -> Result; /// Computes the alignment score between `self` and `other` using the specified /// substitution matrix and gap penalty. @@ -1192,9 +1230,9 @@ where /// let second = "sitting"; /// let matrix = sz::unary_substitution_costs(); /// let gap_penalty = -1; - /// assert_eq!(first.sz_alignment_score(second.as_bytes(), matrix, gap_penalty), -3); + /// assert_eq!(first.sz_needleman_wunsch_score(second.as_bytes(), matrix, gap_penalty), Ok(-3)); /// ``` - fn sz_alignment_score(&self, other: N, matrix: [[i8; 256]; 256], gap: i8) -> isize; + fn sz_needleman_wunsch_score(&self, other: N, matrix: [[i8; 256]; 256], gap: i8) -> Result; /// Returns an iterator over all non-overlapping matches of the given `needle` in `self`. 
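Because the trait methods above now return `Result`, fallible call sites compose naturally with the `?` operator. A sketch, assuming the error type `Status` is exported from the `sz` module and implements `Debug` and `PartialEq` as the updated tests imply; the `similarity` helper is hypothetical:

```rust
use stringzilla::{sz::Status, StringZilla};

// Sketch: normalized similarity in [0, 1], propagating allocation failures with `?`.
fn similarity(a: &str, b: &str) -> Result<f64, Status> {
    let distance = a.sz_levenshtein_distance(b.as_bytes())?;
    let longest = a.len().max(b.len()).max(1);
    Ok(1.0 - distance as f64 / longest as f64)
}

fn main() {
    assert_eq!(similarity("kitten", "kitten"), Ok(1.0));
}
```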
/// @@ -1362,39 +1400,39 @@ where sz::rfind(self, needle) } - fn sz_find_char_from(&self, needles: N) -> Option { - sz::find_char_from(self, needles) + fn sz_find_byte_from(&self, needles: N) -> Option { + sz::find_byte_from(self, needles) } - fn sz_rfind_char_from(&self, needles: N) -> Option { - sz::rfind_char_from(self, needles) + fn sz_rfind_byte_from(&self, needles: N) -> Option { + sz::rfind_byte_from(self, needles) } - fn sz_find_char_not_from(&self, needles: N) -> Option { - sz::find_char_not_from(self, needles) + fn sz_find_byte_not_from(&self, needles: N) -> Option { + sz::find_byte_not_from(self, needles) } - fn sz_rfind_char_not_from(&self, needles: N) -> Option { - sz::rfind_char_not_from(self, needles) + fn sz_rfind_byte_not_from(&self, needles: N) -> Option { + sz::rfind_byte_not_from(self, needles) } - fn sz_edit_distance(&self, other: N) -> usize { - sz::edit_distance(self, other) + fn sz_levenshtein_distance(&self, other: N) -> Result { + sz::levenshtein_distance(self, other) } - fn sz_edit_distance_utf8(&self, other: N) -> usize { - sz::edit_distance_utf8(self, other) + fn sz_levenshtein_distance_utf8(&self, other: N) -> Result { + sz::levenshtein_distance_utf8(self, other) } - fn sz_edit_distance_bounded(&self, other: N, bound: usize) -> usize { - sz::edit_distance_bounded(self, other, bound) + fn sz_levenshtein_distance_bounded(&self, other: N, bound: usize) -> Result { + sz::levenshtein_distance_bounded(self, other, bound) } - fn sz_edit_distance_utf8_bounded(&self, other: N, bound: usize) -> usize { - sz::edit_distance_utf8_bounded(self, other, bound) + fn sz_levenshtein_distance_utf8_bounded(&self, other: N, bound: usize) -> Result { + sz::levenshtein_distance_utf8_bounded(self, other, bound) } - fn sz_alignment_score(&self, other: N, matrix: [[i8; 256]; 256], gap: i8) -> isize { + fn sz_needleman_wunsch_score(&self, other: N, matrix: [[i8; 256]; 256], gap: i8) -> Result { sz::alignment_score(self, other, matrix, gap) } @@ -1415,84 +1453,19 @@ where } fn sz_find_first_of(&'a self, needles: &'a N) -> RangeMatches<'a> { - RangeMatches::new( - self.as_ref(), - MatcherType::FindFirstOf(needles.as_ref()), - true, - ) + RangeMatches::new(self.as_ref(), MatcherType::FindFirstOf(needles.as_ref()), true) } fn sz_find_last_of(&'a self, needles: &'a N) -> RangeRMatches<'a> { - RangeRMatches::new( - self.as_ref(), - MatcherType::FindLastOf(needles.as_ref()), - true, - ) + RangeRMatches::new(self.as_ref(), MatcherType::FindLastOf(needles.as_ref()), true) } fn sz_find_first_not_of(&'a self, needles: &'a N) -> RangeMatches<'a> { - RangeMatches::new( - self.as_ref(), - MatcherType::FindFirstNotOf(needles.as_ref()), - true, - ) + RangeMatches::new(self.as_ref(), MatcherType::FindFirstNotOf(needles.as_ref()), true) } fn sz_find_last_not_of(&'a self, needles: &'a N) -> RangeRMatches<'a> { - RangeRMatches::new( - self.as_ref(), - MatcherType::FindLastNotOf(needles.as_ref()), - true, - ) - } -} - -/// Provides a tool for mutating a byte slice by filling it with random data from a specified alphabet. -/// This trait is especially useful for types that need to be mutable and can reference or be converted to byte slices. 
-/// -/// # Examples -/// -/// Filling a mutable byte buffer with random ASCII letters: -/// -/// ``` -/// use stringzilla::MutableStringZilla; -/// -/// let mut buffer = vec![0u8; 10]; // A buffer to randomize -/// let alphabet = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; // Alphabet to use -/// buffer.sz_randomize(alphabet); -/// -/// println!("Random buffer: {:?}", buffer); -/// // The buffer will now contain random ASCII letters. -/// ``` -pub trait MutableStringZilla -where - A: AsRef<[u8]>, -{ - /// Fills the implementing byte slice with random bytes from the specified `alphabet`. - /// - /// # Examples - /// - /// ``` - /// use stringzilla::MutableStringZilla; - /// - /// let mut text = vec![0; 1000]; // A buffer to randomize - /// let alphabet = b"AGTC"; // Using a DNA alphabet - /// text.sz_randomize(alphabet); - /// - /// // `text` is now filled with random 'A', 'G', 'T', or 'C' values. - /// ``` - fn sz_randomize(&mut self, alphabet: A); -} - -impl MutableStringZilla for T -where - T: AsMut<[u8]>, - A: AsRef<[u8]>, -{ - fn sz_randomize(&mut self, alphabet: A) { - let self_mut = self.as_mut(); - let alphabet_ref = alphabet.as_ref(); - sz::randomize(self_mut, alphabet_ref); + RangeRMatches::new(self.as_ref(), MatcherType::FindLastNotOf(needles.as_ref()), true) } } @@ -1500,50 +1473,46 @@ where mod tests { use std::borrow::Cow; - use crate::sz; - use crate::MutableStringZilla; - use crate::StringZilla; + use crate::sz; // For global functions + use crate::StringZilla; // For member functions #[test] fn hamming() { - assert_eq!(sz::hamming_distance("hello", "hello"), 0); - assert_eq!(sz::hamming_distance("hello", "hell"), 1); - assert_eq!(sz::hamming_distance("abc", "adc"), 1); + assert_eq!(sz::hamming_distance("hello", "hello"), Ok(0)); + assert_eq!(sz::hamming_distance("hello", "hell"), Ok(1)); + assert_eq!(sz::hamming_distance("abc", "adc"), Ok(1)); - assert_eq!(sz::hamming_distance_bounded("abcdefgh", "ABCDEFGH", 2), 2); - assert_eq!(sz::hamming_distance_utf8("αβγδ", "αγγδ"), 1); + assert_eq!(sz::hamming_distance_bounded("abcdefgh", "ABCDEFGH", 2), Ok(2)); + assert_eq!(sz::hamming_distance_utf8("αβγδ", "αγγδ"), Ok(1)); } #[test] fn levenshtein() { - assert_eq!(sz::edit_distance("hello", "hell"), 1); - assert_eq!(sz::edit_distance("hello", "hell"), 1); - assert_eq!(sz::edit_distance("abc", ""), 3); - assert_eq!(sz::edit_distance("abc", "ac"), 1); - assert_eq!(sz::edit_distance("abc", "a_bc"), 1); - assert_eq!(sz::edit_distance("abc", "adc"), 1); - assert_eq!(sz::edit_distance("fitting", "kitty"), 4); - assert_eq!(sz::edit_distance("smitten", "mitten"), 1); - assert_eq!(sz::edit_distance("ggbuzgjux{}l", "gbuzgjux{}l"), 1); - assert_eq!(sz::edit_distance("abcdefgABCDEFG", "ABCDEFGabcdefg"), 14); - - assert_eq!(sz::edit_distance_bounded("fitting", "kitty", 2), 2); - assert_eq!(sz::edit_distance_utf8("façade", "facade"), 1); + assert_eq!(sz::levenshtein_distance("hello", "hell"), Ok(1)); + assert_eq!(sz::levenshtein_distance("hello", "hell"), Ok(1)); + assert_eq!(sz::levenshtein_distance("abc", ""), Ok(3)); + assert_eq!(sz::levenshtein_distance("abc", "ac"), Ok(1)); + assert_eq!(sz::levenshtein_distance("abc", "a_bc"), Ok(1)); + assert_eq!(sz::levenshtein_distance("abc", "adc"), Ok(1)); + assert_eq!(sz::levenshtein_distance("fitting", "kitty"), Ok(4)); + assert_eq!(sz::levenshtein_distance("smitten", "mitten"), Ok(1)); + assert_eq!(sz::levenshtein_distance("ggbuzgjux{}l", "gbuzgjux{}l"), Ok(1)); + assert_eq!(sz::levenshtein_distance("abcdefgABCDEFG", "ABCDEFGabcdefg"), 
Ok(14)); + + assert_eq!(sz::levenshtein_distance_bounded("fitting", "kitty", 2), Ok(2)); + assert_eq!(sz::levenshtein_distance_utf8("façade", "facade"), Ok(1)); } #[test] fn needleman() { let costs_vector = sz::unary_substitution_costs(); - assert_eq!( - sz::alignment_score("listen", "silent", costs_vector, -1), - -4 - ); + assert_eq!(sz::alignment_score("listen", "silent", costs_vector, -1), Ok(-4)); assert_eq!( sz::alignment_score("abcdefgABCDEFG", "ABCDEFGabcdefg", costs_vector, -1), - -14 + Ok(-14) ); - assert_eq!(sz::alignment_score("hello", "hello", costs_vector, -1), 0); - assert_eq!(sz::alignment_score("hello", "hell", costs_vector, -1), -1); + assert_eq!(sz::alignment_score("hello", "hello", costs_vector, -1), Ok(0)); + assert_eq!(sz::alignment_score("hello", "hell", costs_vector, -1), Ok(-1)); } #[test] @@ -1559,41 +1528,37 @@ mod tests { // Use the generic function with a String assert_eq!(my_string.sz_find("world"), Some(7)); assert_eq!(my_string.sz_rfind("world"), Some(7)); - assert_eq!(my_string.sz_find_char_from("world"), Some(2)); - assert_eq!(my_string.sz_rfind_char_from("world"), Some(11)); - assert_eq!(my_string.sz_find_char_not_from("world"), Some(0)); - assert_eq!(my_string.sz_rfind_char_not_from("world"), Some(12)); + assert_eq!(my_string.sz_find_byte_from("world"), Some(2)); + assert_eq!(my_string.sz_rfind_byte_from("world"), Some(11)); + assert_eq!(my_string.sz_find_byte_not_from("world"), Some(0)); + assert_eq!(my_string.sz_rfind_byte_not_from("world"), Some(12)); // Use the generic function with a &str assert_eq!(my_str.sz_find("world"), Some(7)); assert_eq!(my_str.sz_find("world"), Some(7)); - assert_eq!(my_str.sz_find_char_from("world"), Some(2)); - assert_eq!(my_str.sz_rfind_char_from("world"), Some(11)); - assert_eq!(my_str.sz_find_char_not_from("world"), Some(0)); - assert_eq!(my_str.sz_rfind_char_not_from("world"), Some(12)); + assert_eq!(my_str.sz_find_byte_from("world"), Some(2)); + assert_eq!(my_str.sz_rfind_byte_from("world"), Some(11)); + assert_eq!(my_str.sz_find_byte_not_from("world"), Some(0)); + assert_eq!(my_str.sz_rfind_byte_not_from("world"), Some(12)); // Use the generic function with a Cow<'_, str> assert_eq!(my_cow_str.as_ref().sz_find("world"), Some(7)); assert_eq!(my_cow_str.as_ref().sz_find("world"), Some(7)); - assert_eq!(my_cow_str.as_ref().sz_find_char_from("world"), Some(2)); - assert_eq!(my_cow_str.as_ref().sz_rfind_char_from("world"), Some(11)); - assert_eq!(my_cow_str.as_ref().sz_find_char_not_from("world"), Some(0)); - assert_eq!( - my_cow_str.as_ref().sz_rfind_char_not_from("world"), - Some(12) - ); + assert_eq!(my_cow_str.as_ref().sz_find_byte_from("world"), Some(2)); + assert_eq!(my_cow_str.as_ref().sz_rfind_byte_from("world"), Some(11)); + assert_eq!(my_cow_str.as_ref().sz_find_byte_not_from("world"), Some(0)); + assert_eq!(my_cow_str.as_ref().sz_rfind_byte_not_from("world"), Some(12)); } #[test] - fn randomize() { - let mut text: Vec = vec![0; 10]; // A buffer of ten zeros - let alphabet: &[u8] = b"abcd"; // A byte slice alphabet - text.sz_randomize(alphabet); - - // Iterate throught text and check that it only contains letters from the alphabet - assert!(text - .iter() - .all(|&b| b == b'd' || b == b'c' || b == b'b' || b == b'a')); + fn fill_random() { + let mut first_buffer: Vec = vec![0; 10]; // Ten zeros + let mut second_buffer: Vec = vec![1; 10]; // Ten ones + sz::fill_random(&mut first_buffer, 42); + sz::fill_random(&mut second_buffer, 42); + + // Same nonce will produce the same outputs + assert!(first_buffer != 
second_buffer); } mod search_split_iterators { diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 00000000..75306517 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1 @@ +max_width = 120 diff --git a/scripts/bench_memory.cpp b/scripts/bench_memory.cpp index 4f52c282..47a67835 100644 --- a/scripts/bench_memory.cpp +++ b/scripts/bench_memory.cpp @@ -176,7 +176,7 @@ tracked_unary_functions_t transform_functions() { auto wrap_sz = [](auto function) -> unary_function_t { return unary_function_t([function](std::string_view slice) { char *output = const_cast(slice.data()); - function((sz_cptr_t)output, (sz_size_t)slice.size(), (sz_cptr_t)look_up_table, (sz_ptr_t)output); + function((sz_ptr_t)output, (sz_size_t)slice.size(), (sz_cptr_t)output, (sz_cptr_t)look_up_table); return slice.size(); }); }; diff --git a/scripts/bench_similarity.cpp b/scripts/bench_similarity.cpp index ca901a5f..b2d1c9ee 100644 --- a/scripts/bench_similarity.cpp +++ b/scripts/bench_similarity.cpp @@ -38,25 +38,28 @@ tracked_binary_functions_t distance_functions() { }); auto wrap_sz_distance = [alloc](auto function) mutable -> binary_function_t { return binary_function_t([function, alloc](std::string_view a, std::string_view b) mutable -> std::size_t { - return function(a.data(), a.length(), b.data(), b.length(), SZ_SIZE_MAX, &alloc); + sz_size_t result; + function(a.data(), a.length(), b.data(), b.length(), SZ_SIZE_MAX, &alloc, &result); + return result; }); }; auto wrap_sz_scoring = [alloc, costs_ptr](auto function) mutable -> binary_function_t { return binary_function_t( [function, alloc, costs_ptr](std::string_view a, std::string_view b) mutable -> std::size_t { sz_memory_allocator_t *alloc_ptr = &alloc; - sz_ssize_t signed_result = - function(a.data(), a.length(), b.data(), b.length(), costs_ptr, (sz_error_cost_t)-1, alloc_ptr); + sz_ssize_t signed_result; + function(a.data(), a.length(), b.data(), b.length(), costs_ptr, (sz_error_cost_t)-1, alloc_ptr, + &signed_result); return (std::size_t)(-signed_result); }); }; tracked_binary_functions_t result = { {"naive", wrap_baseline}, - {"sz_edit_distance_serial", wrap_sz_distance(sz_edit_distance_serial), true}, - {"sz_alignment_score_serial", wrap_sz_scoring(sz_alignment_score_serial), true}, + {"sz_levenshtein_distance_serial", wrap_sz_distance(sz_levenshtein_distance_serial), true}, + {"sz_needleman_wunsch_score_serial", wrap_sz_scoring(sz_needleman_wunsch_score_serial), true}, #if SZ_USE_ICE - {"sz_edit_distance_ice", wrap_sz_distance(sz_edit_distance_ice), true}, - {"sz_alignment_score_ice", wrap_sz_scoring(sz_alignment_score_ice), true}, + {"sz_levenshtein_distance_ice", wrap_sz_distance(sz_levenshtein_distance_ice), true}, + {"sz_needleman_wunsch_score_ice", wrap_sz_scoring(sz_needleman_wunsch_score_ice), true}, #endif }; return result; diff --git a/scripts/bench_sort.cpp b/scripts/bench_sort.cpp index 22758d95..a045192f 100644 --- a/scripts/bench_sort.cpp +++ b/scripts/bench_sort.cpp @@ -23,13 +23,13 @@ using permute_t = std::vector; #pragma region C callbacks -static char const *get_start(sz_sequence_t const *array_c, sz_size_t i) { - strings_t const &array = *reinterpret_cast(array_c->handle); +static sz_cptr_t get_start(void const *handle, sz_size_t i) { + strings_t const &array = *reinterpret_cast(handle); return array[i].c_str(); } -static sz_size_t get_length(sz_sequence_t const *array_c, sz_size_t i) { - strings_t const &array = *reinterpret_cast(array_c->handle); +static sz_size_t get_length(void const *handle, sz_size_t i) { + strings_t const &array 
= *reinterpret_cast(handle); return array[i].size(); } @@ -112,21 +112,11 @@ int main(int argc, char const **argv) { }); expect_sorted(pgrams, permute); - bench_permute("sz_pgrams_sort_ice", [&]() { + bench_permute("sz_pgrams_sort_skylake", [&]() { std::copy(pgrams.begin(), pgrams.end(), pgrams_sorted.begin()); std::iota(permute.begin(), permute.end(), 0); sz::_with_alloc([&](sz_memory_allocator_t &alloc) { - return sz_pgrams_sort_ice(pgrams_sorted.data(), pgrams_sorted.size(), &alloc, permute.data()); - }); - }); - expect_sorted(pgrams, permute); - - // Unlike the `std::sort` adaptation above, the `sz_pgrams_sort_stable_serial` also sorts the input array inplace - bench_permute("sz_pgrams_sort_stable_serial", [&]() { - std::copy(pgrams.begin(), pgrams.end(), pgrams_sorted.begin()); - std::iota(permute.begin(), permute.end(), 0); - sz::_with_alloc([&](sz_memory_allocator_t &alloc) { - return sz_pgrams_sort_stable_serial(pgrams_sorted.data(), pgrams_sorted.size(), &alloc, permute.data()); + return sz_pgrams_sort_skylake(pgrams_sorted.data(), pgrams_sorted.size(), &alloc, permute.data()); }); }); expect_sorted(pgrams, permute); @@ -151,7 +141,7 @@ int main(int argc, char const **argv) { }); expect_sorted(strings, permute); - bench_permute("sz_sequence_argsort_ice", [&]() { + bench_permute("sz_sequence_argsort_skylake", [&]() { std::iota(permute.begin(), permute.end(), 0); sz_sequence_t array; array.count = strings.size(); @@ -159,7 +149,7 @@ int main(int argc, char const **argv) { array.get_start = get_start; array.get_length = get_length; sz::_with_alloc( - [&](sz_memory_allocator_t &alloc) { return sz_sequence_argsort_ice(&array, &alloc, permute.data()); }); + [&](sz_memory_allocator_t &alloc) { return sz_sequence_argsort_skylake(&array, &alloc, permute.data()); }); }); expect_sorted(strings, permute); diff --git a/scripts/bench_token.cpp b/scripts/bench_token.cpp index 378ad4f0..0d83604b 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -61,12 +61,12 @@ tracked_unary_functions_t hash_functions() { return result; } -struct wrapped_incremental_hash { +struct wrap_hash_stream { sz_hash_state_t state; sz_hash_state_stream_t stream; sz_hash_state_fold_t fold; - wrapped_incremental_hash(sz_hash_state_stream_t s, sz_hash_state_fold_t f) : stream(s), fold(f) { + wrap_hash_stream(sz_hash_state_stream_t s, sz_hash_state_fold_t f) : stream(s), fold(f) { sz_hash_state_init(&state, 42); } @@ -78,20 +78,18 @@ struct wrapped_incremental_hash { tracked_unary_functions_t hash_stream_functions() { tracked_unary_functions_t result = { - {"sz_hash_stream_serial", wrapped_incremental_hash(sz_hash_state_stream_serial, sz_hash_state_fold_serial)}, + {"sz_hash_stream_serial", wrap_hash_stream(sz_hash_state_stream_serial, sz_hash_state_fold_serial)}, #if SZ_USE_HASWELL - {"sz_hash_stream_haswell", wrapped_incremental_hash(sz_hash_state_stream_haswell, sz_hash_state_fold_haswell), - true}, + {"sz_hash_stream_haswell", wrap_hash_stream(sz_hash_state_stream_haswell, sz_hash_state_fold_haswell), true}, #endif #if SZ_USE_SKYLAKE - {"sz_hash_stream_skylake", wrapped_incremental_hash(sz_hash_state_stream_skylake, sz_hash_state_fold_skylake), - true}, + {"sz_hash_stream_skylake", wrap_hash_stream(sz_hash_state_stream_skylake, sz_hash_state_fold_skylake), true}, #endif #if SZ_USE_ICE - {"sz_hash_stream_ice", wrapped_incremental_hash(sz_hash_state_stream_ice, sz_hash_state_fold_ice), true}, + {"sz_hash_stream_ice", wrap_hash_stream(sz_hash_state_stream_ice, sz_hash_state_fold_ice), true}, #endif #if 
SZ_USE_NEON - {"sz_hash_stream_neon", wrapped_incremental_hash(sz_hash_state_stream_neon, sz_hash_state_fold_neon), true}, + {"sz_hash_stream_neon", wrap_hash_stream(sz_hash_state_stream_neon, sz_hash_state_fold_neon), true}, #endif }; return result; diff --git a/scripts/test.py b/scripts/test.py index ea95e8d4..eb92252d 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -178,10 +178,10 @@ def test_unit_split(): assert letters == ["a", "b", "c", "d"] # Splitting using character sets - letters = sz.split_charset("a b_c d", " _") + letters = sz.split_byteset("a b_c d", " _") assert letters == ["a", "b", "c", "d"] - letters = sz.rsplit_charset("a b_c d", " _") + letters = sz.rsplit_byteset("a b_c d", " _") assert letters == ["a", "b", "c", "d"] # Check for equivalence with native Python strings for newline separators @@ -212,17 +212,17 @@ def test_unit_split(): with pytest.raises(ValueError): sz.rsplit(big, "") with pytest.raises(ValueError): - sz.split_charset(big, "") + sz.split_byteset(big, "") with pytest.raises(ValueError): - sz.rsplit_charset(big, "") + sz.rsplit_byteset(big, "") def test_unit_split_iterators(): """ Test the iterator-based split methods. This is slightly different from `split` and `rsplit` in that it returns an iterator instead of a list. - Moreover, the native `rsplit` and even `rsplit_charset` report results in the identical order to `split` - and `split_charset`. Here `rsplit_iter` reports elements in the reverse order, compared to `split_iter`. + Moreover, the native `rsplit` and even `rsplit_byteset` report results in the identical order to `split` + and `split_byteset`. Here `rsplit_iter` reports elements in the reverse order, compared to `split_iter`. """ native = "line1\nline2\nline3" big = Str(native) @@ -244,10 +244,10 @@ def test_unit_split_iterators(): assert letters == ["a", "b", "c", "d"] # Splitting using character sets - letters = list(sz.split_charset_iter("a-b_c-d", "-_")) + letters = list(sz.split_byteset_iter("a-b_c-d", "-_")) assert letters == ["a", "b", "c", "d"] - letters = list(sz.rsplit_charset_iter("a-b_c-d", "-_")) + letters = list(sz.rsplit_byteset_iter("a-b_c-d", "-_")) assert letters == ["d", "c", "b", "a"] # Check for equivalence with native Python strings, including boundary conditions @@ -279,9 +279,9 @@ def rlist(seq): with pytest.raises(ValueError): sz.rsplit_iter(big, "") with pytest.raises(ValueError): - sz.split_charset_iter(big, "") + sz.split_byteset_iter(big, "") with pytest.raises(ValueError): - sz.rsplit_charset_iter(big, "") + sz.rsplit_byteset_iter(big, "") def test_unit_strs_sequence(): @@ -289,7 +289,7 @@ def test_unit_strs_sequence(): big = Str(native) lines = big.splitlines() - assert [2, 1, 0] == list(lines.order()) + assert [2, 1, 0] == list(lines.argsort()) assert "p3" in lines assert "p4" not in lines @@ -301,11 +301,11 @@ def test_unit_strs_sequence(): assert str(Str("a" * 1_000_000).split()).endswith("aaa']") lines.sort() - assert [0, 1, 2] == list(lines.order()) + assert [0, 1, 2] == list(lines.argsort()) assert ["p1", "p2", "p3"] == list(lines) # Reverse order - assert [2, 1, 0] == list(lines.order(reverse=True)) + assert [2, 1, 0] == list(lines.argsort(reverse=True)) lines.sort(reverse=True) assert ["p3", "p2", "p1"] == list(lines) @@ -798,7 +798,7 @@ def test_fuzzy_sorting(list_length: int, part_length: int, variability: int): big_list = big_joined.split(".") native_ordered = sorted(native_list) - native_order = big_list.order() + native_order = big_list.argsort() for i in range(list_length): assert 
native_ordered[i] == native_list[native_order[i]], "Order is wrong" assert native_ordered[i] == str( @@ -826,7 +826,7 @@ def test_fuzzy_sorting(list_length: int, part_length: int, variability: int): big_list = big_joined.split(".") native_ordered = sorted(native_list) - native_order = big_list.order() + native_order = big_list.argsort() for i in range(list_length): assert native_ordered[i] == native_list[native_order[i]], "Order is wrong" assert native_ordered[i] == str( diff --git a/swift/StringProtocol+StringZilla.swift b/swift/StringProtocol+StringZilla.swift index d90c8afc..e573b609 100644 --- a/swift/StringProtocol+StringZilla.swift +++ b/swift/StringProtocol+StringZilla.swift @@ -18,13 +18,13 @@ import StringZillaC // We need to link the standard libraries. #if os(Linux) -import Glibc + import Glibc #else -import Darwin.C + import Darwin.C #endif /// Protocol defining a single-byte data type. -fileprivate protocol SingleByte {} +private protocol SingleByte {} extension UInt8: SingleByte {} extension Int8: SingleByte {} // This would match `CChar` as well. @@ -33,7 +33,7 @@ extension Int8: SingleByte {} // This would match `CChar` as well. enum StringZillaError: Error { case contiguousStorageUnavailable case memoryAllocationFailed - + var localizedDescription: String { switch self { case .contiguousStorageUnavailable: @@ -54,7 +54,7 @@ enum StringZillaError: Error { /// https://developer.apple.com/documentation/swift/stringprotocol/data(using:allowlossyconversion:) public protocol StringZillaViewable: Collection { /// A type that represents a position in the collection. - /// + /// /// Executes a closure with a pointer to the string's UTF8 C representation and its length. /// /// - Parameters: @@ -62,7 +62,7 @@ public protocol StringZillaViewable: Collection { /// - Throws: Can throw an error. /// - Returns: Returns a value of type R, which is the result of the closure. func withStringZillaScope(_ body: (sz_cptr_t, sz_size_t) throws -> R) rethrows -> R - + /// Calculates the offset index for a given byte pointer relative to a start pointer. /// /// - Parameters: @@ -74,24 +74,24 @@ public protocol StringZillaViewable: Collection { extension String: StringZillaViewable { public typealias Index = String.Index - + @_transparent public func withStringZillaScope(_ body: (sz_cptr_t, sz_size_t) throws -> R) rethrows -> R { let cLength = sz_size_t(utf8.count) - return try self.withCString { cString in + return try withCString { cString in try body(cString, cLength) } } - + @_transparent public func stringZillaByteOffset(forByte bytePointer: sz_cptr_t, after startPointer: sz_cptr_t) -> Index { - self.utf8.index(self.utf8.startIndex, offsetBy: bytePointer - startPointer) + utf8.index(utf8.startIndex, offsetBy: bytePointer - startPointer) } } extension Substring.UTF8View: StringZillaViewable { public typealias Index = Substring.UTF8View.Index - + /// Executes a closure with a pointer to the UTF8View's contiguous storage of single-byte elements (UTF-8 code units). /// - Parameters: /// - body: A closure that takes a pointer to the contiguous storage and its size. @@ -106,7 +106,7 @@ extension Substring.UTF8View: StringZillaViewable { throw StringZillaError.contiguousStorageUnavailable }() } - + /// Calculates the offset index for a given byte pointer relative to a start pointer. /// - Parameters: /// - bytePointer: A pointer to the byte for which the offset is calculated. @@ -114,13 +114,13 @@ extension Substring.UTF8View: StringZillaViewable { /// - Returns: The calculated index offset. 
@_transparent public func stringZillaByteOffset(forByte bytePointer: sz_cptr_t, after startPointer: sz_cptr_t) -> Index { - return self.index(self.startIndex, offsetBy: bytePointer - startPointer) + return index(startIndex, offsetBy: bytePointer - startPointer) } } extension String.UTF8View: StringZillaViewable { public typealias Index = String.UTF8View.Index - + /// Executes a closure with a pointer to the UTF8View's contiguous storage of single-byte elements (UTF-8 code units). /// - Parameters: /// - body: A closure that takes a pointer to the contiguous storage and its size. @@ -134,19 +134,18 @@ extension String.UTF8View: StringZillaViewable { throw StringZillaError.contiguousStorageUnavailable }() } - + /// Calculates the offset index for a given byte pointer relative to a start pointer. /// - Parameters: /// - bytePointer: A pointer to the byte for which the offset is calculated. /// - startPointer: The starting pointer for the calculation, previously obtained from `szScope`. /// - Returns: The calculated index offset. public func stringZillaByteOffset(forByte bytePointer: sz_cptr_t, after startPointer: sz_cptr_t) -> Index { - return self.index(self.startIndex, offsetBy: bytePointer - startPointer) + return index(startIndex, offsetBy: bytePointer - startPointer) } } public extension StringZillaViewable { - /// Finds the first occurrence of the specified substring within the receiver. /// - Parameter needle: The substring to search for. /// - Returns: The index of the found occurrence, or `nil` if not found. @@ -163,7 +162,7 @@ public extension StringZillaViewable { } return result } - + /// Finds the last occurrence of the specified substring within the receiver. /// - Parameter needle: The substring to search for. /// - Returns: The index of the found occurrence, or `nil` if not found. @@ -180,7 +179,7 @@ public extension StringZillaViewable { } return result } - + /// Finds the first occurrence of the specified character-set members within the receiver. /// - Parameter characters: A string-like collection of characters to match. /// - Returns: The index of the found occurrence, or `nil` if not found. @@ -190,14 +189,14 @@ public extension StringZillaViewable { var result: Index? withStringZillaScope { hPointer, hLength in characters.withStringZillaScope { nPointer, nLength in - if let matchPointer = sz_find_char_from(hPointer, hLength, nPointer, nLength) { + if let matchPointer = sz_find_byte_from(hPointer, hLength, nPointer, nLength) { result = self.stringZillaByteOffset(forByte: matchPointer, after: hPointer) } } } return result } - + /// Finds the last occurrence of the specified character-set members within the receiver. /// - Parameter characters: A string-like collection of characters to match. /// - Returns: The index of the found occurrence, or `nil` if not found. @@ -207,14 +206,14 @@ public extension StringZillaViewable { var result: Index? withStringZillaScope { hPointer, hLength in characters.withStringZillaScope { nPointer, nLength in - if let matchPointer = sz_rfind_char_from(hPointer, hLength, nPointer, nLength) { + if let matchPointer = sz_rfind_byte_from(hPointer, hLength, nPointer, nLength) { result = self.stringZillaByteOffset(forByte: matchPointer, after: hPointer) } } } return result } - + /// Finds the first occurrence of a character outside of the the given character-set within the receiver. /// - Parameter characters: A string-like collection of characters to exclude. /// - Returns: The index of the found occurrence, or `nil` if not found. 
@@ -224,14 +223,14 @@ public extension StringZillaViewable { var result: Index? withStringZillaScope { hPointer, hLength in characters.withStringZillaScope { nPointer, nLength in - if let matchPointer = sz_find_char_not_from(hPointer, hLength, nPointer, nLength) { + if let matchPointer = sz_find_byte_not_from(hPointer, hLength, nPointer, nLength) { result = self.stringZillaByteOffset(forByte: matchPointer, after: hPointer) } } } return result } - + /// Finds the last occurrence of a character outside of the the given character-set within the receiver. /// - Parameter characters: A string-like collection of characters to exclude. /// - Returns: The index of the found occurrence, or `nil` if not found. @@ -241,40 +240,46 @@ public extension StringZillaViewable { var result: Index? withStringZillaScope { hPointer, hLength in characters.withStringZillaScope { nPointer, nLength in - if let matchPointer = sz_rfind_char_not_from(hPointer, hLength, nPointer, nLength) { + if let matchPointer = sz_rfind_byte_not_from(hPointer, hLength, nPointer, nLength) { result = self.stringZillaByteOffset(forByte: matchPointer, after: hPointer) } } } return result } - - /// Computes the Levenshtein edit distance between this and another string. - /// - Parameter other: A string-like collection of characters to exclude. - /// - Returns: The edit distance, as an unsigned integer. - /// - Throws: If a memory allocation error has happened. - @_specialize(where Self == String, S == String) - @_specialize(where Self == String.UTF8View, S == String.UTF8View) - func editDistance(from other: S, bound: UInt64 = UInt64.max) throws -> UInt64? { - var result: UInt64? - - // Use a do-catch block to handle potential errors - do { - try withStringZillaScope { hPointer, hLength in - try other.withStringZillaScope { nPointer, nLength in - result = UInt64(sz_edit_distance(hPointer, hLength, nPointer, nLength, sz_size_t(bound), nil)) - if result == SZ_SIZE_MAX { - result = nil - throw StringZillaError.memoryAllocationFailed - } - } + + func levenshteinDistance( + from other: S, + bound: UInt? = nil + ) throws -> UInt { + // Prepare a local variable for the result. + var computedResult: sz_size_t = 0 + + // Swift has a ridiculous issue with casting unsigned 64-bit to unsigned 64-bit + // values which results in "Fatal error: Not enough bits to represent the passed value". + // Let's just copy the bytes: https://stackoverflow.com/a/68650250/2766161 + let effectiveBound: sz_size_t = bound.map { sz_size_t($0) } ?? _sz_size_max() + let status = try withStringZillaScope { hPointer, hLength in + try other.withStringZillaScope { nPointer, nLength in + // Pass a mutable pointer for the result. + sz_levenshtein_distance( + hPointer, + hLength, + nPointer, + nLength, + effectiveBound, + nil, // default allocator + &computedResult // out-parameter for the computed distance + ) } - } catch { - // Handle or rethrow the error - throw error } - - return result + + // Check the returned status code. + guard status == sz_success_k else { + // Map the status code to an appropriate Swift error. + throw StringZillaError.memoryAllocationFailed + } + + return UInt(computedResult) } - } diff --git a/swift/Test.swift b/swift/Test.swift index 4b839022..670d758a 100644 --- a/swift/Test.swift +++ b/swift/Test.swift @@ -5,58 +5,57 @@ // Created by Ash Vardanian on 18/1/24. // -import XCTest @testable import StringZilla +import XCTest class StringZillaTests: XCTestCase { - var testString: String! 
- + override func setUp() { super.setUp() testString = "Hello, world! Welcome to StringZilla. 👋" XCTAssertEqual(testString.count, 39) XCTAssertEqual(testString.utf8.count, 42) } - + func testFindFirstSubstring() { let index = testString.findFirst(substring: "world")! XCTAssertEqual(testString[index...], "world! Welcome to StringZilla. 👋") } - + func testFindLastSubstring() { let index = testString.findLast(substring: "o")! XCTAssertEqual(testString[index...], "o StringZilla. 👋") } - + func testFindFirstCharacterFromSet() { let index = testString.findFirst(characterFrom: "aeiou")! XCTAssertEqual(testString[index...], "ello, world! Welcome to StringZilla. 👋") } - + func testFindLastCharacterFromSet() { let index = testString.findLast(characterFrom: "aeiou")! XCTAssertEqual(testString[index...], "a. 👋") } - + func testFindFirstCharacterNotFromSet() { let index = testString.findFirst(characterNotFrom: "aeiou")! XCTAssertEqual(testString[index...], "Hello, world! Welcome to StringZilla. 👋") } - func testFindLastCharacterNotFromSet() { + func testFindLastCharacterNotFromSet() { let index = testString.findLast(characterNotFrom: "aeiou")! XCTAssertEqual(testString.distance(from: testString.startIndex, to: index), 38) XCTAssertEqual(testString[index...], "👋") } - - func testEditDistance() { + + func testLevenshteinDistance() { let otherString = "Hello, world!" - let distance = try? testString.editDistance(from: otherString) + let distance = try? testString.levenshteinDistance(from: otherString) XCTAssertNotNil(distance) XCTAssertEqual(distance, 29) } - + func testFindLastCharacterNotFromSetNoMatch() { let index = "aeiou".findLast(characterNotFrom: "aeiou") XCTAssertNil(index) From 8b396c829d20a59321b60aa6b856bf52f6426795 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 28 Feb 2025 18:17:39 +0000 Subject: [PATCH 136/751] Fix: `fill_random` test condition --- rust/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/lib.rs b/rust/lib.rs index d5e9a682..58553d75 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -1558,7 +1558,7 @@ mod tests { sz::fill_random(&mut second_buffer, 42); // Same nonce will produce the same outputs - assert!(first_buffer != second_buffer); + assert_eq!(first_buffer, second_buffer); } mod search_split_iterators { From d52bf63a3d74529fa1b6c397b874e52243d3e80a Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 2 Mar 2025 00:33:43 +0000 Subject: [PATCH 137/751] Fix: Detecting caps in dynamic builds --- c/lib.c | 4 ++-- include/stringzilla/types.h | 21 ++++++++++++++++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/c/lib.c b/c/lib.c index f742ad2b..3132a8f5 100644 --- a/c/lib.c +++ b/c/lib.c @@ -158,9 +158,9 @@ SZ_INTERNAL sz_capability_t _sz_capabilities_x86(void) { * @return A bitmask of the SIMD capabilities represented as a `sz_capability_t` enum value. */ SZ_DYNAMIC sz_capability_t sz_capabilities(void) { -#if _SZ_IS_X86 +#if _SZ_IS_X86_64 return _sz_capabilities_x86(); -#elif _SZ_IS_ARM +#elif _SZ_IS_ARM64 return _sz_capabilities_arm(); #else return sz_cap_serial_k; diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index c4f71907..a15cf116 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -105,6 +105,25 @@ #endif #endif +/** + * @brief Infer the target architecture. + * At this point we only provide optimized backends for x86_64 and ARM64. 
+ */ +#ifndef _SZ_IS_X86_64 +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) +#define _SZ_IS_X86_64 (1) +#else +#define _SZ_IS_X86_64 (0) +#endif +#endif +#ifndef _SZ_IS_ARM64 +#if defined(__aarch64__) || defined(__arm64__) || defined(__arm64) || defined(_M_ARM64) +#define _SZ_IS_ARM64 (1) +#else +#define _SZ_IS_ARM64 (0) +#endif +#endif + /** * @brief Threshold for switching to SWAR (8-bytes at a time) backend over serial byte-level for-loops. * On very short strings, under 16 bytes long, at most a single word will be processed with SWAR. @@ -230,7 +249,7 @@ */ #if SZ_USE_HASWELL || SZ_USE_SKYLAKE || SZ_USE_ICE #include -#endif // SZ_USE_X86... +#endif // SZ_USE_HASWELL || SZ_USE_SKYLAKE || SZ_USE_ICE #if SZ_USE_NEON #if !defined(_MSC_VER) #include From 8877c82aedf7acc6bba7e667daab3e9e5f36011d Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 2 Mar 2025 00:34:29 +0000 Subject: [PATCH 138/751] Make: Decremental Rust builds SimSIMD uses the same approach --- build.rs | 126 ++++++++++++++++++++++++++----------------------------- 1 file changed, 60 insertions(+), 66 deletions(-) diff --git a/build.rs b/build.rs index bb5fb5cf..9622457f 100644 --- a/build.rs +++ b/build.rs @@ -6,7 +6,12 @@ fn main() { .file("c/lib.c") .include("include") .warnings(false) - .flag_if_supported("-std=c99") + .define("SZ_DYNAMIC_DISPATCH", "1") + .define("SZ_AVOID_LIBC", "0") + .define("SZ_DEBUG", "0") + .flag("-O3") + .flag("-std=c99") // Enforce C99 standard + .flag_if_supported("-fdiagnostics-color=always") .flag_if_supported("-fPIC"); // Cargo will set different environment variables that we can use to properly configure the build. @@ -14,70 +19,6 @@ fn main() { let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); let target_endian = env::var("CARGO_CFG_TARGET_ENDIAN").unwrap_or_default(); - // To get the operating system we can use the TARGET environment variable. - // To check the list of available targets, run `rustc --print target-list`. 
- let target = env::var("TARGET").unwrap_or_default(); - - if target.contains("linux") { - build.flag_if_supported("-fdiagnostics-color=always"); - build.flag_if_supported("-O3"); - build.flag_if_supported("-pedantic"); - - // Set architecture-specific flags and macros - if target_arch == "x86_64" { - build.define("SZ_USE_HASWELL", "1"); - build.define("SZ_USE_SKYLAKE", "1"); - build.define("SZ_USE_ICE", "1"); - } else { - build.define("SZ_USE_HASWELL", "0"); - build.define("SZ_USE_SKYLAKE", "0"); - build.define("SZ_USE_ICE", "0"); - } - - if target_arch == "aarch64" { - build.flag_if_supported("-march=armv8-a+simd"); - build.define("SZ_USE_NEON", "1"); - build.define("SZ_USE_SVE", "1"); - } else { - build.define("SZ_USE_NEON", "0"); - build.define("SZ_USE_SVE", "0"); - } - } else if target.contains("darwin") { - build.flag_if_supported("-fcolor-diagnostics"); - build.flag_if_supported("-O3"); - build.flag_if_supported("-pedantic"); - - if target_arch == "x86_64" { - // Assuming no AVX-512 support for Darwin as per setup.py logic - build.define("SZ_USE_HASWELL", "1"); - build.define("SZ_USE_SKYLAKE", "0"); - build.define("SZ_USE_ICE", "0"); - } else { - build.define("SZ_USE_HASWELL", "0"); - build.define("SZ_USE_SKYLAKE", "0"); - build.define("SZ_USE_ICE", "0"); - } - - if target_arch == "aarch64" { - build.define("SZ_USE_NEON", "1"); - build.define("SZ_USE_SVE", "0"); // Assuming no SVE support for Darwin - } else { - build.define("SZ_USE_NEON", "0"); - build.define("SZ_USE_SVE", "0"); - } - } else if target.contains("windows") { - // Set architecture-specific flags and macros - if target_arch == "x86_64" { - build.define("SZ_USE_HASWELL", "1"); - build.define("SZ_USE_SKYLAKE", "1"); - build.define("SZ_USE_ICE", "1"); - } else { - build.define("SZ_USE_HASWELL", "0"); - build.define("SZ_USE_SKYLAKE", "0"); - build.define("SZ_USE_ICE", "0"); - } - } - // Set endian-specific macro if target_endian == "big" { build.define("SZ_DETECT_BIG_ENDIAN", "1"); @@ -85,9 +26,62 @@ fn main() { build.define("SZ_DETECT_BIG_ENDIAN", "0"); } - build.compile("stringzilla"); + if target_arch == "x86_64" { + build.define("_SZ_IS_X86_64", "1"); + build.define("_SZ_IS_ARM64", "0"); + } else if target_arch == "aarch64" { + build.define("_SZ_IS_X86_64", "0"); + build.define("_SZ_IS_ARM64", "1"); + } + + // At start we will try compiling with all SIMD backends enabled + let flags_to_try = match target_arch.as_str() { + "arm" | "aarch64" => vec![ + // + "SZ_USE_SVE2", + "SZ_USE_SVE", + "SZ_USE_NEON", + ], + _ => vec![ + // + "SZ_USE_ICE", + "SZ_USE_SKYLAKE", + "SZ_USE_HASWELL", + ], + }; + for flag in flags_to_try.iter() { + build.define(flag, "1"); + } + + // If that fails, we will try disabling them one by one + if build.try_compile("stringzilla").is_err() { + print!("cargo:warning=Failed to compile with all SIMD backends..."); + + for flag in flags_to_try.iter() { + build.define(flag, "0"); + if build.try_compile("stringzilla").is_ok() { + break; + } + + // Print the failed configuration + println!( + "cargo:warning=Failed to compile after disabling {}, trying next configuration...", + flag + ); + } + } println!("cargo:rerun-if-changed=c/lib.c"); println!("cargo:rerun-if-changed=rust/lib.rs"); println!("cargo:rerun-if-changed=include/stringzilla/stringzilla.h"); + + // Constituent parts: + println!("cargo:rerun-if-changed=include/stringzilla/compare.h"); + println!("cargo:rerun-if-changed=include/stringzilla/find.h"); + println!("cargo:rerun-if-changed=include/stringzilla/hash.h"); + 
println!("cargo:rerun-if-changed=include/stringzilla/memory.h"); + println!("cargo:rerun-if-changed=include/stringzilla/similarity.h"); + println!("cargo:rerun-if-changed=include/stringzilla/small_string.h"); + println!("cargo:rerun-if-changed=include/stringzilla/sort.h"); + println!("cargo:rerun-if-changed=include/stringzilla/types.h"); } From fbf256aca7b62f3e978334d5554b968fec0a1fa2 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 2 Mar 2025 00:35:09 +0000 Subject: [PATCH 139/751] Make: `cibuildwheel` env variables --- pyproject.toml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a8dd42e2..ed969673 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,50 +79,50 @@ before-build = ["rd /s /q {project}\\build || echo Done"] [[tool.cibuildwheel.overrides]] select = "*-win_amd64" inherit.environment = "append" -environment.SZ_X86_64 = "1" +environment._SZ_IS_X86_64 = "1" [[tool.cibuildwheel.overrides]] select = "*-manylinux*_x86_64" inherit.environment = "append" -environment.SZ_X86_64 = "1" +environment._SZ_IS_X86_64 = "1" [[tool.cibuildwheel.overrides]] select = "*-musllinux*_x86_64" inherit.environment = "append" -environment.SZ_X86_64 = "1" +environment._SZ_IS_X86_64 = "1" [[tool.cibuildwheel.overrides]] select = "*-macos*_x86_64" inherit.environment = "append" -environment.SZ_X86_64 = "1" +environment._SZ_IS_X86_64 = "1" # Detect ARM 64-bit builds [[tool.cibuildwheel.overrides]] select = "*-win_arm64" inherit.environment = "append" -environment.SZ_ARM64 = "1" +environment._SZ_IS_ARM64 = "1" [[tool.cibuildwheel.overrides]] select = "*-manylinux*_aarch64" inherit.environment = "append" -environment.SZ_ARM64 = "1" +environment._SZ_IS_ARM64 = "1" [[tool.cibuildwheel.overrides]] select = "*-musllinux*_aarch64" inherit.environment = "append" -environment.SZ_ARM64 = "1" +environment._SZ_IS_ARM64 = "1" [[tool.cibuildwheel.overrides]] select = "*-macos*_arm64" inherit.environment = "append" -environment.SZ_ARM64 = "1" +environment._SZ_IS_ARM64 = "1" # Detect MacOS Universal2 builds [[tool.cibuildwheel.overrides]] select = "*-macos*_universal2" inherit.environment = "append" -environment.SZ_X86_64 = "1" -environment.SZ_ARM64 = "1" +environment._SZ_IS_X86_64 = "1" +environment._SZ_IS_ARM64 = "1" [tool.cibuildwheel.macos.environment] MACOSX_DEPLOYMENT_TARGET = "10.11" From a30b5b7f54dd586b703415162707cb3b22e40bd4 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 2 Mar 2025 00:36:26 +0000 Subject: [PATCH 140/751] Improve: Inline most common Rust APIs --- rust/lib.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rust/lib.rs b/rust/lib.rs index 58553d75..a7f2e0ba 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -153,6 +153,7 @@ pub mod sz { /// # Returns /// /// A `u64` representing the checksum value of the input byte slice. + #[inline(always)] pub fn bytesum(text: T) -> u64 where T: AsRef<[u8]>, @@ -177,6 +178,7 @@ pub mod sz { /// # Returns /// /// A `u64` representing the hash value of the input byte slice. + #[inline(always)] pub fn hash_with_seed(text: T, seed: u64) -> u64 where T: AsRef<[u8]>, @@ -200,6 +202,7 @@ pub mod sz { /// # Returns /// /// A `u64` representing the hash value of the input byte slice. 
+ #[inline(always)] pub fn hash(text: T) -> u64 where T: AsRef<[u8]>, @@ -253,6 +256,7 @@ pub mod sz { /// /// An `Option` representing the starting index of the last occurrence of `needle` /// within `haystack` if found, otherwise `None`. + #[inline(always)] pub fn rfind(haystack: H, needle: N) -> Option where H: AsRef<[u8]>, @@ -286,6 +290,7 @@ pub mod sz { /// /// An `Option` representing the index of the first occurrence of any byte from /// `needles` within `haystack`, if found, otherwise `None`. + #[inline(always)] pub fn find_byte_from(haystack: H, needles: N) -> Option where H: AsRef<[u8]>, From 9a32744b7046c18039e6e6b4c1d33b9832726638 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 2 Mar 2025 00:37:48 +0000 Subject: [PATCH 141/751] Add: Dispatched version API --- c/lib.c | 4 + include/stringzilla/stringzilla.h | 55 ++++++++++++++ python/lib.c | 15 +--- rust/lib.rs | 118 +++++++++++++++++++++++++++++- 4 files changed, 176 insertions(+), 16 deletions(-) diff --git a/c/lib.c b/c/lib.c index 3132a8f5..9c6324dd 100644 --- a/c/lib.c +++ b/c/lib.c @@ -385,9 +385,13 @@ BOOL WINAPI _DllMainCRTStartup(HINSTANCE hints, DWORD forward_reason, LPVOID lp) __attribute__((constructor)) static void sz_dispatch_table_init_on_gcc_or_clang(void) { sz_dispatch_table_init(); } #endif +SZ_DYNAMIC int sz_dynamic_dispatch(void) { return 1; } SZ_DYNAMIC int sz_version_major(void) { return STRINGZILLA_H_VERSION_MAJOR; } SZ_DYNAMIC int sz_version_minor(void) { return STRINGZILLA_H_VERSION_MINOR; } SZ_DYNAMIC int sz_version_patch(void) { return STRINGZILLA_H_VERSION_PATCH; } +SZ_DYNAMIC sz_cptr_t sz_capabilities_to_string(sz_capability_t caps) { + return _sz_capabilities_to_string_implementation(caps); +} SZ_DYNAMIC sz_u64_t sz_bytesum(sz_cptr_t text, sz_size_t length) { return sz_dispatch_table.bytesum(text, length); } diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index c497d4f1..824bacd4 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -80,17 +80,72 @@ typedef enum { */ SZ_DYNAMIC sz_capability_t sz_capabilities(void); +/** + * @brief Internal helper function to convert SIMD capabilities to a string. + * @sa sz_capabilities_to_string, sz_capabilities + */ +SZ_INTERNAL sz_cptr_t _sz_capabilities_to_string_implementation(sz_capability_t caps) { + + static char buf[256]; + char *p = buf; + char *const end = buf + sizeof(buf); + + // Mapping each flag to its string literal. + struct { + sz_capability_t flag; + char const *name; + } capability_map[] = { + {sz_cap_serial_k, "serial"}, {sz_cap_haswell_k, "haswell"}, {sz_cap_skylake_k, "skylake"}, + {sz_cap_ice_k, "ice"}, {sz_cap_neon_k, "neon"}, {sz_cap_neon_aes_k, "neon+aes"}, + {sz_cap_sve_k, "sve"}, {sz_cap_sve2_k, "sve2"}, {sz_cap_sve2_aes_k, "sve2+aes"}, + }; + int const capabilities_count = sizeof(capability_map) / sizeof(capability_map[0]); + + // Iterate over each capability flag. + for (int i = 0; i < capabilities_count; i++) { + if (caps & capability_map[i].flag) { + int const is_first = p == buf; + // Add separator if this is not the first capability. + if (!is_first) { + char const sep[3] = {',', ' ', '\0'}; + char const *s = sep; + while (*s && p < end - 1) *p++ = *s++; + } + // Append the capability name character by character. + char const *s = capability_map[i].name; + while (*s && p < end - 1) *p++ = *s++; + } + } + + // If no capability was added, write "none". 
+ int const nothing_detected = p == buf; + if (nothing_detected) { + char const *s = "none"; + while (*s && p < end - 1) *p++ = *s++; + } + + // Null-terminate the string. + *p = '\0'; + return buf; +} + #if defined(SZ_DYNAMIC_DISPATCH) +SZ_DYNAMIC int sz_dynamic_dispatch(void); SZ_DYNAMIC int sz_version_major(void); SZ_DYNAMIC int sz_version_minor(void); SZ_DYNAMIC int sz_version_patch(void); +SZ_DYNAMIC sz_cptr_t sz_capabilities_to_string(sz_capability_t caps); #else +SZ_DYNAMIC int sz_dynamic_dispatch(void) { return 0; } SZ_PUBLIC int sz_version_major(void) { return STRINGZILLA_H_VERSION_MAJOR; } SZ_PUBLIC int sz_version_minor(void) { return STRINGZILLA_H_VERSION_MINOR; } SZ_PUBLIC int sz_version_patch(void) { return STRINGZILLA_H_VERSION_PATCH; } +SZ_PUBLIC sz_cptr_t sz_capabilities_to_string(sz_capability_t caps) { + return _sz_capabilities_to_string_implementation(caps); +} #endif diff --git a/python/lib.c b/python/lib.c index 46ed1c51..cf6ec6fb 100644 --- a/python/lib.c +++ b/python/lib.c @@ -3726,20 +3726,7 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { // Define SIMD capabilities { sz_capability_t caps = sz_capabilities(); - char caps_str[512]; - char const *serial = (caps & sz_cap_serial_k) ? "serial," : ""; - char const *neon = (caps & sz_cap_neon_k) ? "neon," : ""; - char const *neon_aes = (caps & sz_cap_neon_aes_k) ? "neon_aes," : ""; - char const *sve = (caps & sz_cap_sve_k) ? "sve," : ""; - char const *sve2 = (caps & sz_cap_sve2_k) ? "sve2," : ""; - char const *sve2_aes = (caps & sz_cap_sve2_aes_k) ? "sve2_aes," : ""; - char const *haswell = (caps & sz_cap_haswell_k) ? "haswell," : ""; - char const *skylake = (caps & sz_cap_skylake_k) ? "skylake," : ""; - char const *ice = (caps & sz_cap_ice_k) ? "ice," : ""; - sprintf(caps_str, "%s%s%s%s%s%s%s%s%s", // - serial, // - neon, neon_aes, sve, sve2, sve2_aes, // - haswell, skylake, ice); + sz_cptr_t caps_str = sz_capability_to_string(caps); PyModule_AddStringConstant(m, "__capabilities__", caps_str); } diff --git a/rust/lib.rs b/rust/lib.rs index a7f2e0ba..c8dd629f 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -1,7 +1,7 @@ #![cfg_attr(not(test), no_std)] /// The `sz` module provides a collection of string searching and manipulation functionality, -/// designed for high efficiency and compatibility with no_std environments. This module offers +/// designed for high efficiency and compatibility with `no_std` environments. This module offers /// various utilities for byte string manipulation, including search, reverse search, and /// edit-distance calculations, suitable for a wide range of applications from basic string /// processing to complex text analysis tasks. @@ -63,10 +63,19 @@ pub mod sz { } } - use core::{ffi::c_void, usize}; + use core::fmt::{self, Write}; + use core::{ffi::c_void, ffi::CStr, usize}; // Import the functions from the StringZilla C library. extern "C" { + + fn sz_dynamic_dispatch() -> i32; + fn sz_version_major() -> i32; + fn sz_version_minor() -> i32; + fn sz_version_patch() -> i32; + fn sz_capabilities() -> u32; + fn sz_capabilities_to_string(caps: u32) -> *const c_void; + fn sz_find( haystack: *const c_void, haystack_length: usize, @@ -91,6 +100,8 @@ pub mod sz { fn sz_fill_random(text: *mut c_void, length: usize, seed: u64); + // fn sz_sort() -> Status; + pub fn sz_levenshtein_distance( a: *const c_void, a_length: usize, @@ -142,6 +153,103 @@ pub mod sz { } + /// A simple semantic version structure. 
+ #[derive(Debug, Copy, Clone, PartialEq, Eq)] + pub struct SemVer { + pub major: i32, + pub minor: i32, + pub patch: i32, + } + + impl SemVer { + pub const fn new(major: i32, minor: i32, patch: i32) -> Self { + Self { major, minor, patch } + } + } + + /// Checks if the library was compiled with dynamic dispatch enabled. + pub fn dynamic_dispatch() -> bool { + unsafe { sz_dynamic_dispatch() != 0 } + } + + /// Returns the semantic version information. + pub fn version() -> SemVer { + SemVer { + major: unsafe { sz_version_major() }, + minor: unsafe { sz_version_minor() }, + patch: unsafe { sz_version_patch() }, + } + } + + /// A fixed-size, compile-time known C-string buffer type. + /// It keeps track of the number of written bytes (excluding the null terminator). + pub struct FixedCString { + buf: [u8; N], + len: usize, + } + + impl FixedCString { + /// Create a new, empty buffer. + /// The buffer always has a terminating NUL (0) byte at position `len`. + pub const fn new() -> Self { + Self { buf: [0u8; N], len: 0 } + } + + /// Returns the raw pointer to the C string. + pub fn as_ptr(&self) -> *const u8 { + self.buf.as_ptr() + } + + /// Returns a reference as a CStr. + /// # Safety + /// The buffer must be correctly NUL terminated. + pub fn as_c_str(&self) -> &CStr { + // We know buf[..=len] is NUL-terminated because write_str() always sets it. + unsafe { CStr::from_bytes_with_nul_unchecked(&self.buf[..=self.len]) } + } + + /// Returns the current content as a &str. + /// Returns an empty string if the content isn’t valid UTF‑8. + pub fn as_str(&self) -> &str { + core::str::from_utf8(&self.buf[..self.len]).unwrap_or("") + } + } + + impl Write for FixedCString { + fn write_str(&mut self, s: &str) -> fmt::Result { + let bytes = s.as_bytes(); + // Ensure we have room for the new bytes and a NUL terminator. + if self.len + bytes.len() >= N { + return Err(fmt::Error); + } + self.buf[self.len..self.len + bytes.len()].copy_from_slice(bytes); + self.len += bytes.len(); + // Always set a null terminator. + self.buf[self.len] = 0; + Ok(()) + } + } + + pub type SmallCString = FixedCString<256>; + + /// Copies the capabilities C-string into a fixed buffer and returns it. + /// The returned SmallCString is guaranteed to be null-terminated. + pub fn capabilities() -> SmallCString { + let caps = unsafe { sz_capabilities() }; + let caps_ptr = unsafe { sz_capabilities_to_string(caps) }; + // Assume that the external function returns a valid null-terminated C string. + let cstr = unsafe { CStr::from_ptr(caps_ptr as *const i8) }; + let bytes = cstr.to_bytes(); + + let mut buf = SmallCString::new(); + // Use core::fmt::Write to copy the bytes. + // If the string is too long, it will fail. You might want to truncate in a real-world use. + // Here, we assume it fits. + let s = core::str::from_utf8(bytes).unwrap_or(""); + let _ = buf.write_str(s); + buf + } + /// Computes the checksum value of unsigned bytes in a given byte slice `text`. /// This function is useful for verifying data integrity and detecting changes in /// binary data, such as files or network packets. 
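A minimal usage sketch for the metadata helpers introduced above (illustrative only, not part of
the patch; it assumes the crate is published as `stringzilla` and that its `sz` module re-exports
`dynamic_dispatch`, `version`, and `capabilities` as declared in this diff):

    use stringzilla::sz;

    fn print_build_info() {
        // Both calls just forward to the C library through the FFI declarations above.
        let v = sz::version();
        let caps = sz::capabilities();
        if sz::dynamic_dispatch() {
            // Example output: "serial, haswell, skylake, ice" on an Ice Lake machine.
            println!("StringZilla {}.{}.{}: {}", v.major, v.minor, v.patch, caps.as_str());
        }
    }

The `metadata` test added in the next hunk exercises exactly these two entry points.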
@@ -1481,6 +1589,12 @@ mod tests { use crate::sz; // For global functions use crate::StringZilla; // For member functions + #[test] + fn metadata() { + assert!(sz::dynamic_dispatch()); + assert!(sz::capabilities().as_str().len() > 0); + } + #[test] fn hamming() { assert_eq!(sz::hamming_distance("hello", "hello"), Ok(0)); From b2085ccc4ee3cdfbca595f73c2cc9bcfe69e2587 Mon Sep 17 00:00:00 2001 From: Mikayel Grigoryan Date: Sun, 2 Mar 2025 10:43:29 +0400 Subject: [PATCH 142/751] Improve: Exposed sz_move, sz_fill, and sz_copy for Rust --- rust/lib.rs | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/rust/lib.rs b/rust/lib.rs index c8dd629f..26a86ae8 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -68,7 +68,9 @@ pub mod sz { // Import the functions from the StringZilla C library. extern "C" { - + fn sz_copy(target: *const c_void, source: *const c_void, length: usize); + fn sz_fill(target: *const c_void, length: usize, value: u8); + fn sz_move(target: *const c_void, source: *const c_void, length: usize); fn sz_dynamic_dispatch() -> i32; fn sz_version_major() -> i32; fn sz_version_minor() -> i32; @@ -273,6 +275,57 @@ pub mod sz { return result; } + /// Moves the contents of `source` into `target`, overwriting the existing contents of `target`. + /// This function is useful for scenarios where you need to replace the contents of a byte slice + /// with the contents of another byte slice. + pub fn move_bytes(target: &mut T, source: &S) + where + T: AsMut<[u8]> + ?Sized, + S: AsRef<[u8]> + ?Sized, + { + let target_slice = target.as_mut(); + let source_slice = source.as_ref(); + unsafe { + sz_move( + target_slice.as_mut_ptr() as *const c_void, + source_slice.as_ptr() as *const c_void, + source_slice.len(), + ); + } + } + + /// Fills the contents of `target` with the specified `value`. This function is useful for + /// scenarios where you need to set all bytes in a byte slice to a specific value, such as + /// zeroing out a buffer or initializing a buffer with a specific byte pattern. + pub fn fill(target: &mut T, value: u8) + where + T: AsMut<[u8]> + ?Sized, + { + let target_slice = target.as_mut(); + unsafe { + sz_fill(target_slice.as_ptr() as *const c_void, target_slice.len(), value); + } + } + + /// Copies the contents of `source` into `target`, overwriting the existing contents of `target`. + /// This function is useful for scenarios where you need to replace the contents of a byte slice + /// with the contents of another byte slice. + pub fn copy(target: &mut T, source: &S) + where + T: AsMut<[u8]> + ?Sized, + S: AsRef<[u8]> + ?Sized, + { + let target_slice = target.as_mut(); + let source_slice = source.as_ref(); + unsafe { + sz_copy( + target_slice.as_mut_ptr() as *mut c_void, + source_slice.as_ptr() as *const c_void, + source_slice.len(), + ); + } + } + /// Computes a 64-bit AES-based hash value for a given byte slice `text`. /// This function is designed to provide a high-quality hash value for use in /// hash tables, data structures, and cryptographic applications. 
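A short illustration of the memory helpers exposed by this patch (a sketch under the assumption
that the `sz` module re-exports `fill`, `copy`, and `move_bytes` with the signatures shown above;
`scrub_and_clone` is a hypothetical caller, not part of the library):

    use stringzilla::sz;

    fn scrub_and_clone(src: &[u8]) -> Vec<u8> {
        // The target must be at least as long as the source for `copy`.
        let mut dst = vec![0u8; src.len()];
        sz::fill(&mut dst, 0xAA); // set every byte to a known pattern
        sz::copy(&mut dst, src);  // then overwrite it with the source bytes
        dst
    }

`move_bytes` follows the same shape but, like `memmove`, is the variant intended for overlapping
ranges.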
From 1757e4ec50d9a177cd5f64d96337d51cbe28d4fc Mon Sep 17 00:00:00 2001 From: Mikayel Grigoryan Date: Sun, 2 Mar 2025 11:08:59 +0400 Subject: [PATCH 143/751] Improve: Expose sz_hash_state_init, sz_hash_state_stream, and sz_hash_state_fold to Rust --- rust/lib.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/rust/lib.rs b/rust/lib.rs index 26a86ae8..784c5454 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -153,6 +153,14 @@ pub mod sz { result: *mut isize, ) -> Status; + /// Initializes a hash state with a given seed value. + fn sz_hash_state_init(state: *const c_void, seed: u64); + + /// Updates the hash state with a new byte slice. + fn sz_hash_state_stream(state: *const c_void, text: *const c_void, length: usize); + + /// Finalizes the hash state and returns the computed hash value. + fn sz_hash_state_fold(state: *const c_void) -> u64; } /// A simple semantic version structure. From 471b0024db2b4c183f3277ca08f59a2af89eec24 Mon Sep 17 00:00:00 2001 From: Mikayel Grigoryan Date: Sun, 2 Mar 2025 11:14:19 +0400 Subject: [PATCH 144/751] Improve: Expose sz_lookup --- rust/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rust/lib.rs b/rust/lib.rs index 784c5454..14f10c7e 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -161,6 +161,8 @@ pub mod sz { /// Finalizes the hash state and returns the computed hash value. fn sz_hash_state_fold(state: *const c_void) -> u64; + + fn sz_lookup(target: *const c_void, length: usize, source: *const c_void, lut: *const u8) -> *const c_void; } /// A simple semantic version structure. From 9fe25df8abbe428fc7b2255dce19e96acd3d444f Mon Sep 17 00:00:00 2001 From: Mikayel Grigoryan Date: Sun, 2 Mar 2025 11:26:45 +0400 Subject: [PATCH 145/751] Improve: Remove redundant comments from sz_hash_state functions in Rust --- rust/lib.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/rust/lib.rs b/rust/lib.rs index 14f10c7e..b3cbca4e 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -153,13 +153,10 @@ pub mod sz { result: *mut isize, ) -> Status; - /// Initializes a hash state with a given seed value. fn sz_hash_state_init(state: *const c_void, seed: u64); - /// Updates the hash state with a new byte slice. fn sz_hash_state_stream(state: *const c_void, text: *const c_void, length: usize); - /// Finalizes the hash state and returns the computed hash value. 
fn sz_hash_state_fold(state: *const c_void) -> u64; fn sz_lookup(target: *const c_void, length: usize, source: *const c_void, lut: *const u8) -> *const c_void; From c7b841e6eba0ec0600634daaf9b788f8916c425f Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 4 Mar 2025 07:35:09 +0000 Subject: [PATCH 146/751] Add: Serial JOINs --- include/stringzilla/sort.h | 78 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 4 deletions(-) diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index 721ba940..d619369e 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -847,11 +847,81 @@ SZ_PUBLIC sz_status_t sz_pgrams_join_serial(sz_pgram_t *pgrams, sz_size_t count, return sz_success_k; } -SZ_PUBLIC sz_status_t sz_sequence_join_serial( // - sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, // - sz_memory_allocator_t *alloc, sz_size_t *intersection_size, // +SZ_PUBLIC sz_status_t sz_sequence_join_serial( // + sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, // + sz_memory_allocator_t *alloc, sz_u64_t seed, sz_size_t *intersection_count_ptr, // sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions) { - sz_unused(first_sequence && second_sequence && alloc && intersection_size && first_positions && second_positions); + + // To join to unordered sets of strings, the simplest approach would be to hash them into a dynamically + // allocated hash table and then iterate over the second set, checking for the presence of each element in the + // hash table. This would require O(N) memory and O(N) time complexity, where N is the smaller set. + sz_sequence_t const *small_sequence, *large_sequence; + sz_sorted_idx_t *small_positions, *large_positions; + if (first_sequence->count <= second_sequence->count) { + small_sequence = first_sequence, large_sequence = second_sequence; + small_positions = first_positions, large_positions = second_positions; + } + else { + small_sequence = second_sequence, large_sequence = first_sequence; + small_positions = second_positions, large_positions = first_positions; + } + + // We may very well have nothing to join + if (small_sequence->count == 0) { + *intersection_count_ptr = 0; + return sz_success_k; + } + + // Allocate memory for the hash table and initialize it with 0xFF. + sz_size_t const hash_table_slots = sz_size_bit_ceil(small_sequence->count * 2); + sz_size_t const bytes_per_entry = sizeof(sz_size_t) + sizeof(sz_u64_t); + sz_size_t *table_positions = (sz_size_t *)alloc->allocate(hash_table_slots * bytes_per_entry, alloc); + if (!table_positions) return sz_bad_alloc_k; + sz_u64_t *table_fingerprints = (sz_u64_t *)(table_positions + hash_table_slots); + sz_fill((sz_ptr_t)table_positions, hash_table_slots * bytes_per_entry, 0xFF); + + // Hash the smaller set into the hash table using the default available backend. + for (sz_size_t small_position = 0; small_position < small_sequence->count; ++small_position) { + sz_cptr_t const str = small_sequence->get_start(small_sequence->handle, small_position); + sz_size_t const length = small_sequence->get_length(small_sequence->handle, small_position); + sz_u64_t const hash = sz_hash(str, length, seed); + sz_size_t hash_slot = hash; + // Implement linear probing to resolve collisions. 
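+        // E.g. if two strings hash to the same slot, the second insertion keeps
+        // incrementing `hash_slot`; the `& (hash_table_slots - 1)` mask on every
+        // access wraps the probe around the power-of-two table until an empty
+        // slot (position still 0xFF...FF) is found.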
+ while (table_positions[hash_slot & (hash_table_slots - 1)] != SZ_SIZE_MAX) ++hash_slot; + table_positions[hash_slot & (hash_table_slots - 1)] = small_position; + table_fingerprints[hash_slot & (hash_table_slots - 1)] = hash; + } + + // Iterate over the larger set and check for the presence of each element in the hash table. + sz_size_t intersection_count = 0; + for (sz_size_t large_position = 0; large_position < large_sequence->count; ++large_position) { + sz_cptr_t const str = large_sequence->get_start(large_sequence->handle, large_position); + sz_size_t const length = large_sequence->get_length(large_sequence->handle, large_position); + sz_u64_t const hash = sz_hash(str, length, seed); + sz_size_t hash_slot = hash; + // Implement linear probing to resolve collisions. + for (; table_positions[hash_slot & (hash_table_slots - 1)] != SZ_SIZE_MAX; ++hash_slot) { + sz_u64_t small_hash = table_fingerprints[hash_slot & (hash_table_slots - 1)]; + if (small_hash != hash) continue; + + // The hash matches, compare the strings. + sz_size_t const small_position = table_positions[hash_slot & (hash_table_slots - 1)]; + sz_size_t const small_length = small_sequence->get_length(small_sequence->handle, small_position); + if (length != small_length) continue; + + sz_cptr_t const small_str = small_sequence->get_start(small_sequence->handle, small_position); + sz_bool_t const same = sz_equal(str, small_str, length); + if (same != sz_true_k) continue; + + // Finally, there is a match, store the positions. + small_positions[intersection_count] = small_position; + large_positions[intersection_count] = large_position; + ++intersection_count; + break; + } + } + + *intersection_count_ptr = intersection_count; return sz_success_k; } From 75fabf1ba4c241790b0ecfd69c22c7c0a821ec4f Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 4 Mar 2025 07:36:02 +0000 Subject: [PATCH 147/751] Fix: Passing `sz_sequence_t::handle` --- include/stringzilla/sort.h | 20 ++++++++++---------- scripts/bench_sort.cpp | 8 ++++---- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index d619369e..16ca31fe 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -264,10 +264,10 @@ SZ_PUBLIC void sz_sequence_argsort_with_insertion(sz_sequence_t const *sequence, while (j > 0) { // Get the two strings to compare. sz_sorted_idx_t previous_idx = order[j - 1]; - sz_cptr_t previous_start = sequence->get_start(sequence, previous_idx); - sz_cptr_t current_start = sequence->get_start(sequence, current_idx); - sz_size_t previous_length = sequence->get_length(sequence, previous_idx); - sz_size_t current_length = sequence->get_length(sequence, current_idx); + sz_cptr_t previous_start = sequence->get_start(sequence->handle, previous_idx); + sz_cptr_t current_start = sequence->get_start(sequence->handle, current_idx); + sz_size_t previous_length = sequence->get_length(sequence->handle, previous_idx); + sz_size_t current_length = sequence->get_length(sequence->handle, current_idx); // Use the provided sz_order to compare. sz_ordering_t ordering = sz_order(previous_start, previous_length, current_start, current_length); @@ -470,8 +470,8 @@ SZ_INTERNAL void _sz_sequence_argsort_serial_export_next_pgrams( _sz_assert(partial_order_index == i && "At start this must be an identity permutation."); // Get the string slice in global memory. 
- sz_cptr_t const source_str = sequence->get_start(sequence, partial_order_index); - sz_size_t const length = sequence->get_length(sequence, partial_order_index); + sz_cptr_t const source_str = sequence->get_start(sequence->handle, partial_order_index); + sz_size_t const length = sequence->get_length(sequence->handle, partial_order_index); sz_size_t const remaining_length = length > start_character ? length - start_character : 0; sz_size_t const exported_length = remaining_length > pgram_capacity ? pgram_capacity : remaining_length; @@ -497,10 +497,10 @@ SZ_INTERNAL void _sz_sequence_argsort_serial_export_next_pgrams( for (sz_size_t i = start_in_sequence + 1; i < end_in_sequence; ++i) { sz_pgram_t const previous_pgram = global_pgrams[i - 1]; sz_pgram_t const current_pgram = global_pgrams[i]; - sz_cptr_t const previous_str = sequence->get_start(sequence, i - 1); - sz_size_t const previous_length = sequence->get_length(sequence, i - 1); - sz_cptr_t const current_str = sequence->get_start(sequence, i); - sz_size_t const current_length = sequence->get_length(sequence, i); + sz_cptr_t const previous_str = sequence->get_start(sequence->handle, i - 1); + sz_size_t const previous_length = sequence->get_length(sequence->handle, i - 1); + sz_cptr_t const current_str = sequence->get_start(sequence->handle, i); + sz_size_t const current_length = sequence->get_length(sequence->handle, i); sz_ordering_t const ordering = sz_order( // previous_str, previous_length > pgram_capacity ? pgram_capacity : previous_length, // current_str, current_length > pgram_capacity ? pgram_capacity : current_length); diff --git a/scripts/bench_sort.cpp b/scripts/bench_sort.cpp index a045192f..f32d9909 100644 --- a/scripts/bench_sort.cpp +++ b/scripts/bench_sort.cpp @@ -42,10 +42,10 @@ static int _get_qsort_order(const void *a, const void *b, void *arg) { sz_size_t idx_a = *(sz_size_t *)a; sz_size_t idx_b = *(sz_size_t *)b; - char const *str_a = sequence->get_start(sequence, idx_a); - char const *str_b = sequence->get_start(sequence, idx_b); - sz_size_t len_a = sequence->get_length(sequence, idx_a); - sz_size_t len_b = sequence->get_length(sequence, idx_b); + char const *str_a = sequence->get_start(sequence->handle, idx_a); + char const *str_b = sequence->get_start(sequence->handle, idx_b); + sz_size_t len_a = sequence->get_length(sequence->handle, idx_a); + sz_size_t len_b = sequence->get_length(sequence->handle, idx_b); int res = strncmp(str_a, str_b, len_a < len_b ? len_a : len_b); return res ? res : (int)(len_a - len_b); From ea5dc76c2ff9f8b09913a271c4e1fe3696ca817b Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 7 Mar 2025 12:26:17 +0000 Subject: [PATCH 148/751] Add: Intersections on Ice Lake --- include/stringzilla/intersect.h | 749 ++++++++++++++++++++++++++++++ include/stringzilla/sort.h | 305 +----------- include/stringzilla/stringzilla.h | 3 +- 3 files changed, 756 insertions(+), 301 deletions(-) create mode 100644 include/stringzilla/intersect.h diff --git a/include/stringzilla/intersect.h b/include/stringzilla/intersect.h new file mode 100644 index 00000000..77033148 --- /dev/null +++ b/include/stringzilla/intersect.h @@ -0,0 +1,749 @@ +/** + * @brief Hardware-accelerated string collection intersections for JOIN-like DBMS operations. 
+ * @file intersect.h + * @author Ash Vardanian + * + * Includes core APIs for `sz_sequence_t` string collections with hardware-specific backends: + * + * - `sz_sequence_intersection` - to compute the strict intersection of two deduplicated string collections. + * - TODO: `sz_sequence_join` - to compute the intersection of two arbitrary string collections. + */ +#ifndef STRINGZILLA_INTERSECT_H_ +#define STRINGZILLA_INTERSECT_H_ + +#include "types.h" + +#include "compare.h" // `sz_compare` +#include "memory.h" // `sz_fill` +#include "hash.h" // `sz_hash` + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief The @b power-of-two memory-usage budget @b multiple for the hash table. + * + * The behaviour of hashing-based approaches can often be tuned with different "hyper-parameter" values. + * For "unordered set intersections" implemented here, the @p budget argument controls the balance between + * throughput and memory usage. The higher the budget, the more memory is used, but the fewer collisions + * will be observed + */ +#if !defined(SZ_SEQUENCE_INTERSECT_BUDGET) +#define SZ_SEQUENCE_INTERSECT_BUDGET (1) +#endif + +#pragma region Core API + +/** + * @brief Intersects two @b deduplicated binary @b string sequences, using a hash table. + * Outputs the @p first_positions from the @p first_sequence and @p second_positions from + * the @p second_sequence, that contain matched strings. Missing matches are represented as `SZ_SIZE_MAX`. + * + * @param[in] first_sequence First immutable sequence of strings to intersection. + * @param[in] second_sequence Second immutable sequence of strings to intersection. + * @param[in] semantics JOIN semantics for the intersection, including handling of duplicates. + * @param[in] alloc Optional memory allocator for temporary storage. + * @param[in] seed Optional seed for the hash table to avoid attacks. + * @param[out] intersection_size Number of matching strings in both sequences. + * @param[out] first_positions Offset positions of the matching strings from the @p first_sequence. + * @param[out] second_positions Offset positions of the matching strings from the @p second_sequence. + * + * @retval `sz_success_k` if the operation was successful. + * @retval `sz_bad_alloc_k` if the operation failed due to memory allocation failure. + * @pre The @p first_positions array must fit at least `min(first_sequence->count, second_sequence->count)` items. + * @pre The @p second_positions array must fit at least `min(first_sequence->count, second_sequence->count)` items. + * @warning Doesn't check for duplicates and won't return `sz_contains_duplicates_k`. Duplicates result in UB. + * + * Example usage: + * + * @code{.c} + * #include + * int main() { + * char const *first[] = {"banana", "apple", "cherry"}; + * char const *second[] = {"cherry", "orange", "pineapple", "banana"}; + * sz_sequence_t first_sequence, second_sequence; + * sz_sequence_from_null_terminated_strings(first, 3, &first_sequence); + * sz_sequence_from_null_terminated_strings(second, 4, &second_sequence); + * sz_size_t intersection_size; + * sz_sorted_idx_t first_positions[3], second_positions[3]; //? 3 is the size of the smaller sequence + * sz_status_t status = sz_sequence_intersect(&first_sequence, &second_sequence, + * sz_join_inner_strict_k, NULL, 0, + * &intersection_size, first_positions, second_positions); + * return status == sz_success_k && intersection_size == 2 ? 0 : 1; + * } + * @endcode + * + * @note The algorithm has linear memory complexity and linear time complexity. 
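+ * @note The scratch memory is a power-of-two hash table sized after the smaller sequence:
+ *       roughly `sz_size_bit_ceil(min(count)) << SZ_SEQUENCE_INTERSECT_BUDGET` slots of
+ *       16 bytes each on 64-bit targets, e.g. ~32 MB for a 1M-string smaller side with
+ *       the default budget of 1.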
+ * @see https://en.wikipedia.org/wiki/Join_(SQL) + * + * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. + * @sa sz_sequence_intersect_serial, sz_sequence_intersect_ice, sz_sequence_intersect_sve + */ +SZ_DYNAMIC sz_status_t sz_sequence_intersect(sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, + sz_memory_allocator_t *alloc, sz_u64_t seed, sz_size_t *intersection_size, + sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions); + +/** + * @brief Defines various JOIN semantics for string sequences, including handling of duplicates. + * @sa sz_join_inner_strict_k, sz_join_inner_k, sz_join_left_outer_k, sz_join_right_outer_k, sz_join_full_outer_k, + * sz_join_cross_k + */ +typedef enum { + /** + * @brief Strict inner join with uniqueness enforcement. + * + * In this mode, only unique matching strings from both sequences are returned. + * If either sequence contains duplicate strings, the operation will fail. + * + * Example: + * - Input: + * first_sequence: { "apple", "banana", "cherry" } + * second_sequence: { "banana", "cherry", "date" } + * - Output: + * Result: { ("banana", "banana"), ("cherry", "cherry") } + * + * SQL equivalent: + * @code{.sql} + * -- Returns unique matching rows only. + * SELECT DISTINCT a.* + * FROM first_sequence a + * INNER JOIN second_sequence b ON a.string = b.string; + * @endcode + */ + sz_join_inner_strict_k = 0, + + /** + * @brief Conventional inner join allowing duplicate entries. + * + * This mode returns all pairs of matching strings from both sequences. + * Each occurrence in the first sequence is paired with every matching occurrence + * in the second sequence. Order stability is not guaranteed. + * + * Example: + * - Input: + * first_sequence: { "apple", "banana", "banana" } + * second_sequence: { "banana", "banana", "cherry" } + * - Output: + * Result: { ("banana", "banana"), ("banana", "banana"), + * ("banana", "banana"), ("banana", "banana") } + * (2 occurrences of "banana" in the first sequence × 2 in the second = 4 pairs) + * + * SQL equivalent: + * @code{.sql} + * SELECT a.*, b.* + * FROM first_sequence a + * INNER JOIN second_sequence b ON a.string = b.string; + * @endcode + */ + sz_join_inner_k = 1, + + /** + * @brief Left outer join preserving all entries from the first sequence. + * + * This mode returns every string from the first sequence along with matching strings + * from the second sequence. If no match is found for an element in the first sequence, + * the corresponding output for the second sequence is NULL (or its equivalent). + * + * Example: + * - Input: + * first_sequence: { "apple", "banana", "cherry" } + * second_sequence: { "banana", "cherry", "date" } + * - Output: + * Result: { ("apple", NULL), ("banana", "banana"), ("cherry", "cherry") } + * + * SQL equivalent: + * @code{.sql} + * SELECT a.*, b.* + * FROM first_sequence a + * LEFT OUTER JOIN second_sequence b ON a.string = b.string; + * @endcode + */ + sz_join_left_outer_k = 2, + + /** + * @brief Right outer join preserving all entries from the second sequence. + * + * This mode returns every string from the second sequence along with matching strings + * from the first sequence. If no match is found for an element in the second sequence, + * the corresponding output for the first sequence is NULL (or its equivalent). 
+ * + * Example: + * - Input: + * first_sequence: { "apple", "banana" } + * second_sequence: { "banana", "cherry", "date" } + * - Output: + * Result: { ("banana", "banana"), (NULL, "cherry"), (NULL, "date") } + * + * SQL equivalent: + * @code{.sql} + * SELECT a.*, b.* + * FROM first_sequence a + * RIGHT OUTER JOIN second_sequence b ON a.string = b.string; + * @endcode + */ + sz_join_right_outer_k = 3, + + /** + * @brief Full outer join combining all entries from both sequences. + * + * This mode returns all matching pairs along with unmatched strings from both sequences. + * For unmatched strings, the corresponding result from the other sequence is NULL. + * + * Example: + * - Input: + * first_sequence: { "apple", "banana" } + * second_sequence: { "banana", "cherry" } + * - Output: + * Result: { ("apple", NULL), ("banana", "banana"), (NULL, "cherry") } + * + * SQL equivalent: + * @code{.sql} + * SELECT a.*, b.* + * FROM first_sequence a + * FULL OUTER JOIN second_sequence b ON a.string = b.string; + * @endcode + */ + sz_join_full_outer_k = 4, + + /** + * @brief Cross join (Cartesian product) of two sequences. + * + * This mode returns the Cartesian product of both sequences, pairing every string in the first sequence + * with every string in the second sequence regardless of any matching condition. + * + * Example: + * - Input: + * first_sequence: { "apple", "banana" } + * second_sequence: { "cherry", "date" } + * - Output: + * Result: { ("apple", "cherry"), ("apple", "date"), + * ("banana", "cherry"), ("banana", "date") } + * + * SQL equivalent: + * @code{.sql} + * SELECT a.*, b.* + * FROM first_sequence a, second_sequence b; + * @endcode + */ + sz_join_cross_k = 5, +} sz_sequence_join_semantics_t; + +#if SZ_USE_ICE + +/** @copydoc sz_sequence_intersect */ +SZ_PUBLIC sz_status_t sz_sequence_intersect_ice( // + sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, // + sz_memory_allocator_t *alloc, sz_u64_t seed, sz_size_t *intersection_size, // + sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions); + +#endif + +#if SZ_USE_SVE + +/** @copydoc sz_sequence_intersect */ +SZ_PUBLIC sz_status_t sz_sequence_intersect_sve( // + sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, // + sz_memory_allocator_t *alloc, sz_u64_t seed, sz_size_t *intersection_size, // + sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions); + +#endif + +#pragma endregion + +#pragma region Serial Implementation + +SZ_PUBLIC sz_status_t sz_sequence_intersect_serial( // + sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, // + sz_memory_allocator_t *alloc, sz_u64_t seed, sz_size_t *intersection_count_ptr, // + sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions) { + + // To join to unordered sets of strings, the simplest approach would be to hash them into a dynamically + // allocated hash table and then iterate over the second set, checking for the presence of each element in the + // hash table. This would require O(N) memory and O(N) time complexity, where N is the smaller set. 
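+    // Hashing the smaller side keeps the scratch table compact and cache-resident:
+    // in a 1-thousand vs 1-million intersection the table holds only ~2048 slots,
+    // and every string of the million-element side merely probes it.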
+ sz_sequence_t const *small_sequence, *large_sequence; + sz_sorted_idx_t *small_positions, *large_positions; + if (first_sequence->count <= second_sequence->count) { + small_sequence = first_sequence, large_sequence = second_sequence; + small_positions = first_positions, large_positions = second_positions; + } + else { + small_sequence = second_sequence, large_sequence = first_sequence; + small_positions = second_positions, large_positions = first_positions; + } + + // We may very well have nothing to join + if (small_sequence->count == 0) { + *intersection_count_ptr = 0; + return sz_success_k; + } + + // Allocate memory for the hash table and initialize it with 0xFF. + // The higher is the `hash_table_slots` multiple - the more memory we will use, + // but the less likely the collisions will be. + sz_size_t const hash_table_slots = sz_size_bit_ceil(small_sequence->count) * (1 << SZ_SEQUENCE_INTERSECT_BUDGET); + sz_size_t const bytes_per_entry = sizeof(sz_size_t) + sizeof(sz_u64_t); + sz_size_t *const table_positions = (sz_size_t *)alloc->allocate(hash_table_slots * bytes_per_entry, alloc); + if (!table_positions) return sz_bad_alloc_k; + sz_u64_t *const table_hashes = (sz_u64_t *)(table_positions + hash_table_slots); + sz_fill((sz_ptr_t)table_positions, hash_table_slots * bytes_per_entry, 0xFF); + + // Hash the smaller set into the hash table using the default available backend. + for (sz_size_t small_position = 0; small_position < small_sequence->count; ++small_position) { + sz_cptr_t const str = small_sequence->get_start(small_sequence->handle, small_position); + sz_size_t const length = small_sequence->get_length(small_sequence->handle, small_position); + sz_u64_t const hash = sz_hash(str, length, seed); + sz_size_t hash_slot = hash & (hash_table_slots - 1); + // Implement linear probing to find the first free slot. + // If we somehow face 2 different strings with same hash, we will export that hash 2 times! + while (table_hashes[hash_slot] != SZ_SIZE_MAX) hash_slot = (hash_slot + 1) & (hash_table_slots - 1); + table_hashes[hash_slot] = hash; + table_positions[hash_slot] = small_position; + } + + // Iterate over the larger set and check for the presence of each element in the hash table. + sz_size_t intersection_count = 0; + for (sz_size_t large_position = 0; large_position < large_sequence->count; ++large_position) { + sz_cptr_t const str = large_sequence->get_start(large_sequence->handle, large_position); + sz_size_t const length = large_sequence->get_length(large_sequence->handle, large_position); + sz_u64_t const hash = sz_hash(str, length, seed); + sz_size_t hash_slot = hash & (hash_table_slots - 1); + + // Implement linear probing to resolve collisions. + for (; table_hashes[hash_slot] != SZ_SIZE_MAX; hash_slot = (hash_slot + 1) & (hash_table_slots - 1)) { + sz_u64_t small_hash = table_hashes[hash_slot]; + if (small_hash != hash) continue; + + // The hash matches, compare the strings. + sz_size_t const small_position = table_positions[hash_slot]; + sz_size_t const small_length = small_sequence->get_length(small_sequence->handle, small_position); + if (length != small_length) continue; + + // Same hash may still imply different strings, so we need to compare them. + sz_cptr_t const small_str = small_sequence->get_start(small_sequence->handle, small_position); + sz_bool_t const same = sz_equal(str, small_str, length); + if (same != sz_true_k) continue; + + // Finally, there is a match, store the positions. 
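+            // The inputs are assumed deduplicated, so each probe chain yields at most
+            // one match, and we can stop probing right after recording it (`break` below).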
+ small_positions[intersection_count] = small_position; + large_positions[intersection_count] = large_position; + ++intersection_count; + break; + } + } + + *intersection_count_ptr = intersection_count; + return sz_success_k; +} + +#pragma endregion // Serial Implementation + +/* AVX512 implementation of the string search algorithms for Ice Lake and newer CPUs. + * Includes extensions: + * - 2017 Skylake: F, CD, ER, PF, VL, DQ, BW, + * - 2018 CannonLake: IFMA, VBMI, + * - 2019 Ice Lake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES. + * + * We are going to use VBMI2 for `_mm256_maskz_compress_epi8`. + */ +#pragma region Ice Lake Implementation +#if SZ_USE_ICE +#pragma GCC push_options +#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "avx512vnni", "bmi", "bmi2", \ + "aes", "vaes") +#pragma clang attribute push( \ + __attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vnni,bmi,bmi2,aes,vaes"))), \ + apply_to = function) + +SZ_INTERNAL int _sz_u64x4_contains_collisions_haswell(__m256i v) { + // Assume `v` stores values: [a, b, c, d]. + __m256i cmp1 = _mm256_cmpeq_epi64(v, _mm256_permute4x64_epi64(v, 0xB1)); // 0xB1 produces [b, a, d, c] + __m256i cmp2 = _mm256_cmpeq_epi64(v, _mm256_permute4x64_epi64(v, 0x4E)); // 0x4E produces [c, d, a, b] + __m256i cmp3 = _mm256_cmpeq_epi64(v, _mm256_permute4x64_epi64(v, 0x1B)); // 0x1B produces [d, c, b, a] + + // Combine the results from the three comparisons. + __m256i cmp = _mm256_or_si256(_mm256_or_si256(cmp1, cmp2), cmp3); + + // Each 64-bit lane comparison yields all ones if equal, so the movemask will be nonzero if any pair matched. + int mask = _mm256_movemask_epi8(cmp); + return mask; +} + +SZ_PUBLIC sz_status_t sz_sequence_intersect_ice( // + sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, // + sz_memory_allocator_t *alloc, sz_u64_t seed, sz_size_t *intersection_count_ptr, // + sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions) { + + // To join to unordered sets of strings, the simplest approach would be to hash them into a dynamically + // allocated hash table and then iterate over the second set, checking for the presence of each element in the + // hash table. This would require O(N) memory and O(N) time complexity, where N is the smaller set. + sz_sequence_t const *small_sequence, *large_sequence; + sz_sorted_idx_t *small_positions, *large_positions; + if (first_sequence->count <= second_sequence->count) { + small_sequence = first_sequence, large_sequence = second_sequence; + small_positions = first_positions, large_positions = second_positions; + } + else { + small_sequence = second_sequence, large_sequence = first_sequence; + small_positions = second_positions, large_positions = first_positions; + } + + // We may very well have nothing to join + if (small_sequence->count == 0) { + *intersection_count_ptr = 0; + return sz_success_k; + } + + // Allocate memory for the hash table and initialize it with 0xFF. + // The higher is the `hash_table_slots` multiple - the more memory we will use, + // but the less likely the collisions will be. 
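+    // E.g. a small set of 1000 strings with the default budget of 1 gives
+    // bit_ceil(1000) * 2 = 2048 slots of 16 bytes each - a 32 KB table that
+    // fits comfortably in the L2 cache.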
+ sz_size_t const hash_table_slots = sz_size_bit_ceil(small_sequence->count) * (1 << SZ_SEQUENCE_INTERSECT_BUDGET); + sz_size_t const bytes_per_entry = sizeof(sz_size_t) + sizeof(sz_u64_t); + sz_size_t *table_positions = (sz_size_t *)alloc->allocate(hash_table_slots * bytes_per_entry, alloc); + if (!table_positions) return sz_bad_alloc_k; + sz_u64_t *table_hashes = (sz_u64_t *)(table_positions + hash_table_slots); + sz_fill((sz_ptr_t)table_positions, hash_table_slots * bytes_per_entry, 0xFF); + + // Conceptually the Ice Lake variant is similar to the serial one, except it takes advantage of: + // - computing 4x individual high-quality hashes with `_mm512_aesenc_epi128`. + // - gathering values from the hash-table using `_mm256_mmask_i64gather_epi64`. + // + // We still start by hashing the smaller set into the hash table, but we will process 4 entries + // at a time and will separately handle values under 16 bytes fitting into one AES block and the + // larger values. + // + // For larger entries, we will use a separate loop afterwards to decrease the likelihood of collisions + // on the shorter entries, that can benefit from vectorized processing. + _sz_hash_minimal_x4_t batch_hashes_states_initial; + _sz_hash_minimal_x4_init_ice(&batch_hashes_states_initial, seed); + sz_size_t count_longer = 0; + for (sz_size_t small_position = 0; small_position < small_sequence->count;) { + sz_string_view_t batch[4]; + sz_u256_vec_t batch_positions; + sz_size_t batch_size; + for (batch_size = 0; batch_size < 4 && small_position < small_sequence->count; ++small_position) { + sz_size_t length = small_sequence->get_length(small_sequence->handle, small_position); + if (length > 16) { + count_longer++; + continue; + } + sz_cptr_t str = small_sequence->get_start(small_sequence->handle, small_position); + batch[batch_size].start = str; + batch[batch_size].length = length; + batch_positions.u64s[batch_size] = small_position; + ++batch_size; + } + + // If we couldn't populate the whole batch, fall back to the serial solution + if (batch_size != 4) { + for (sz_size_t i = 0; i < batch_size; ++i) { + sz_cptr_t const str = batch[i].start; + sz_size_t const length = batch[i].length; + sz_u64_t const hash = sz_hash(str, length, seed); + sz_size_t hash_slot = hash & (hash_table_slots - 1); + // Implement linear probing to find the first free slot. + // If we somehow face 2 different strings with same hash, we will export that hash 2 times! + while (table_hashes[hash_slot] != SZ_SIZE_MAX) hash_slot = (hash_slot + 1) & (hash_table_slots - 1); + table_hashes[hash_slot] = hash; + table_positions[hash_slot] = batch_positions.u64s[i]; + } + } + // The batch is successfully populated, let's use the vectorized solution + else { + // Now let's load the first bytes of each string. 
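+            // All strings in the batch are at most 16 bytes, and the masked loads below
+            // only enable the first `length` byte lanes of each register, so bytes past
+            // the end of a short string are never faulted on.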
+ sz_u256_vec_t batch_hashes; + sz_u512_vec_t batch_prefixes; + batch_prefixes.xmms[0] = _mm_maskz_loadu_epi8(_sz_u16_mask_until(batch[0].length), batch[0].start); + batch_prefixes.xmms[1] = _mm_maskz_loadu_epi8(_sz_u16_mask_until(batch[1].length), batch[1].start); + batch_prefixes.xmms[2] = _mm_maskz_loadu_epi8(_sz_u16_mask_until(batch[2].length), batch[2].start); + batch_prefixes.xmms[3] = _mm_maskz_loadu_epi8(_sz_u16_mask_until(batch[3].length), batch[3].start); + + // Reuse the already computed state for hashes + _sz_hash_minimal_x4_t batch_hashes_states = batch_hashes_states_initial; + _sz_hash_minimal_x4_update_ice(&batch_hashes_states, batch_prefixes.zmm); + batch_hashes.ymm = _sz_hash_minimal_x4_finalize_ice(&batch_hashes_states, batch[0].length, batch[1].length, + batch[2].length, batch[3].length); + _sz_assert(batch_hashes.u64s[0] == sz_hash(batch[0].start, batch[0].length, seed)); + _sz_assert(batch_hashes.u64s[1] == sz_hash(batch[1].start, batch[1].length, seed)); + _sz_assert(batch_hashes.u64s[2] == sz_hash(batch[2].start, batch[2].length, seed)); + _sz_assert(batch_hashes.u64s[3] == sz_hash(batch[3].start, batch[3].length, seed)); + + // Now let's perform an optimistic hash-table lookup using vectorized gathers + sz_u256_vec_t batch_slots, existing_hashes; + batch_slots.ymm = _mm256_and_si256(batch_hashes.ymm, _mm256_set1_epi64x(hash_table_slots - 1)); + + // In case of very small inputs, it's more likely, that some of the 4x hashes or their slots will collide + int const has_slot_collisions = _sz_u64x4_contains_collisions_haswell(batch_slots.ymm); + + // Before scattering the new positions - gather the pre-existing ones. + // In case of `has_slot_collisions`, this will practically be a "prefetch" operation. + existing_hashes.ymm = + _mm256_mmask_i64gather_epi64(_mm256_setzero_si256(), 0xFF, batch_slots.ymm, table_hashes, 8); + + // Check that we don't have any collisions - in that case each value will be equal to `SZ_SIZE_MAX` + int const all_empty = _mm256_testc_si256(existing_hashes.ymm, _mm256_set1_epi64x(SZ_SIZE_MAX)); + if (all_empty && !has_slot_collisions) { + // Scatter the new positions + _mm256_mask_i64scatter_epi64(table_hashes, 0xFF, batch_slots.ymm, batch_hashes.ymm, 8); + _mm256_mask_i64scatter_epi64(table_positions, 0xFF, batch_slots.ymm, batch_positions.ymm, 8); + } + else { + // We have a collision, let's resolve it with a serial solution + for (sz_size_t i = 0; i < 4; ++i) { + sz_size_t hash_slot = batch_slots.u64s[i] & (hash_table_slots - 1); + // Implement linear probing to find the first free slot. + // If we somehow face 2 different strings with same hash, we will export that hash 2 times! + while (table_hashes[hash_slot] != SZ_SIZE_MAX) hash_slot = (hash_slot + 1) & (hash_table_slots - 1); + table_hashes[hash_slot] = batch_hashes.u64s[i]; + table_positions[hash_slot] = batch_positions.u64s[i]; + } + } + } + } + + // Now, let's cross-reference all shorter values from the larger collection. 
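+    // This lookup pass mirrors the insertion pass above: strings longer than 16 bytes
+    // are skipped here (only tallied in `count_longer`) and resolved by the scalar
+    // pass at the very end of the function.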
+ sz_size_t intersection_count = 0; + for (sz_size_t large_position = 0; large_position < large_sequence->count;) { + sz_string_view_t batch[4]; + sz_u256_vec_t batch_positions; + sz_size_t batch_size; + for (batch_size = 0; batch_size < 4 && large_position < large_sequence->count; ++large_position) { + sz_size_t length = large_sequence->get_length(large_sequence->handle, large_position); + if (length > 16) { + count_longer++; + continue; + } + sz_cptr_t str = large_sequence->get_start(large_sequence->handle, large_position); + batch[batch_size].start = str; + batch[batch_size].length = length; + batch_positions.u64s[batch_size] = large_position; + ++batch_size; + } + + // If we couldn't populate the whole batch, fall back to the serial solution + if (batch_size != 4) { + for (sz_size_t i = 0; i < batch_size; ++i) { + sz_cptr_t const str = batch[i].start; + sz_size_t const length = batch[i].length; + sz_u64_t const hash = sz_hash(str, length, seed); + sz_size_t hash_slot = hash & (hash_table_slots - 1); + // Implement linear probing to resolve collisions. + for (; table_hashes[hash_slot] != SZ_SIZE_MAX; hash_slot = (hash_slot + 1) & (hash_table_slots - 1)) { + sz_u64_t small_hash = table_hashes[hash_slot]; + if (small_hash != hash) continue; + + // The hash matches, compare the strings. + sz_size_t const small_position = table_positions[hash_slot]; + sz_size_t const small_length = small_sequence->get_length(small_sequence->handle, small_position); + if (length != small_length) continue; + + // Same hash may still imply different strings, so we need to compare them. + sz_cptr_t const small_str = small_sequence->get_start(small_sequence->handle, small_position); + sz_bool_t const same = sz_equal(str, small_str, length); + if (same != sz_true_k) continue; + + // Finally, there is a match, store the positions. + small_positions[intersection_count] = small_position; + large_positions[intersection_count] = batch_positions.u64s[i]; + ++intersection_count; + break; + } + } + } + // The batch is successfully populated, let's use the vectorized solution + else { + // Now let's load the first bytes of each string. + sz_u256_vec_t batch_hashes; + sz_u512_vec_t batch_prefixes; + batch_prefixes.xmms[0] = _mm_maskz_loadu_epi8(_sz_u16_mask_until(batch[0].length), batch[0].start); + batch_prefixes.xmms[1] = _mm_maskz_loadu_epi8(_sz_u16_mask_until(batch[1].length), batch[1].start); + batch_prefixes.xmms[2] = _mm_maskz_loadu_epi8(_sz_u16_mask_until(batch[2].length), batch[2].start); + batch_prefixes.xmms[3] = _mm_maskz_loadu_epi8(_sz_u16_mask_until(batch[3].length), batch[3].start); + + // Reuse the already computed state for hashes + _sz_hash_minimal_x4_t batch_hashes_states = batch_hashes_states_initial; + _sz_hash_minimal_x4_update_ice(&batch_hashes_states, batch_prefixes.zmm); + batch_hashes.ymm = _sz_hash_minimal_x4_finalize_ice(&batch_hashes_states, batch[0].length, batch[1].length, + batch[2].length, batch[3].length); + _sz_assert(batch_hashes.u64s[0] == sz_hash(batch[0].start, batch[0].length, seed)); + _sz_assert(batch_hashes.u64s[1] == sz_hash(batch[1].start, batch[1].length, seed)); + _sz_assert(batch_hashes.u64s[2] == sz_hash(batch[2].start, batch[2].length, seed)); + _sz_assert(batch_hashes.u64s[3] == sz_hash(batch[3].start, batch[3].length, seed)); + + // Now let's perform an optimistic hash-table lookup using vectorized gathers. 
+ sz_u256_vec_t batch_slots, existing_hashes; + batch_slots.ymm = _mm256_and_si256(batch_hashes.ymm, _mm256_set1_epi64x(hash_table_slots - 1)); + + // Before scattering the new positions - gather the pre-existing ones. + // This can help us detect values: + // - that are definitely missing in the hash table, if the slot is just NULL-ed + // - that may be present in the hash table, and need to be validated in the loop + existing_hashes.ymm = + _mm256_mmask_i64gather_epi64(_mm256_setzero_si256(), 0xFF, batch_slots.ymm, table_hashes, 8); + + // Check if we already have all of those slots populated with exactly the same values + int const same_hashes = _mm256_movemask_epi8(_mm256_cmpeq_epi64(existing_hashes.ymm, batch_hashes.ymm)); + int const nulled_hashes = + _mm256_movemask_epi8(_mm256_cmpeq_epi64(existing_hashes.ymm, _mm256_set1_epi64x(SZ_SIZE_MAX))); + + // Now for every one of the 4 hashed values we can have several outcomes: + // - it's an "empty" value → no match + // - it's a different hash → continue probing + // - it's the same hash for a different string, so we have a rare collision → continue probing + // - it's the same hash for the same string, so we have a match → export + // + // That logic is too complex to be effectively handled by SIMD, so we switch back to serial code. + for (sz_size_t i = 0; i < 4; ++i) { + sz_cptr_t const str = batch[i].start; + sz_size_t const length = batch[i].length; + sz_u64_t const hash = batch_hashes.u64s[i]; + int const same_hash = (same_hashes >> (8 * i)) & 0xFF; + int const nulled_hash = (nulled_hashes >> (8 * i)) & 0xFF; + if (nulled_hash) continue; + + sz_size_t hash_slot = batch_slots.u64s[i]; + // This optimization may look like just one less memory load, + // but it will help us produce a different set of branches and will affect + // the branch prediction quality on the CPU backend. + if (same_hash) { + // The hash matches, compare the strings. + sz_size_t const small_position = table_positions[hash_slot]; + sz_size_t const small_length = small_sequence->get_length(small_sequence->handle, small_position); + if (length == small_length) { + // Same hash may still imply different strings, so we need to compare them. + sz_cptr_t const small_str = small_sequence->get_start(small_sequence->handle, small_position); + sz_bool_t const same = sz_equal(str, small_str, length); + if (same == sz_true_k) { + // Finally, there is a match, store the positions. + small_positions[intersection_count] = small_position; + large_positions[intersection_count] = batch_positions.u64s[i]; + ++intersection_count; + // Now go to the next value in the batch. + continue; + } + } + // If any of the conditions above didn't hold, just continue probing. + hash_slot = (hash_slot + 1) & (hash_table_slots - 1); + } + + // Implement linear probing to resolve collisions. + for (; table_hashes[hash_slot] != SZ_SIZE_MAX; hash_slot = (hash_slot + 1) & (hash_table_slots - 1)) { + sz_u64_t small_hash = table_hashes[hash_slot]; + if (small_hash != hash) continue; + + // The hash matches, compare the strings. + sz_size_t const small_position = table_positions[hash_slot]; + sz_size_t const small_length = small_sequence->get_length(small_sequence->handle, small_position); + if (length != small_length) continue; + + // Same hash may still imply different strings, so we need to compare them. 
+ sz_cptr_t const small_str = small_sequence->get_start(small_sequence->handle, small_position); + sz_bool_t const same = sz_equal(str, small_str, length); + if (same != sz_true_k) continue; + + // Finally, there is a match, store the positions. + small_positions[intersection_count] = small_position; + large_positions[intersection_count] = batch_positions.u64s[i]; + ++intersection_count; + break; + } + } + } + } + + // TODO: Consider one more level of partitioning, separating the values into [17:64] and [64:] ranges. + if (count_longer) { + // At this point only large values are remaining, let's process them with the code identical to our + // serial solution, but dispatching the right Ice Lake kernel under the hood. + sz_fill((sz_ptr_t)table_positions, hash_table_slots * bytes_per_entry, 0xFF); + + // Hash the smaller set into the hash table using the default available backend. + for (sz_size_t small_position = 0; small_position < small_sequence->count; ++small_position) { + sz_size_t const length = small_sequence->get_length(small_sequence->handle, small_position); + if (length <= 16) continue; //! This is the only difference from the serial solution + sz_cptr_t const str = small_sequence->get_start(small_sequence->handle, small_position); + sz_u64_t const hash = sz_hash(str, length, seed); + sz_size_t hash_slot = hash & (hash_table_slots - 1); + // Implement linear probing to find the first free slot. + // If we somehow face 2 different strings with same hash, we will export that hash 2 times! + while (table_hashes[hash_slot] != SZ_SIZE_MAX) hash_slot = (hash_slot + 1) & (hash_table_slots - 1); + table_hashes[hash_slot] = hash; + table_positions[hash_slot] = small_position; + } + + // Iterate over the larger set and check for the presence of each element in the hash table. + for (sz_size_t large_position = 0; large_position < large_sequence->count; ++large_position) { + sz_size_t const length = large_sequence->get_length(large_sequence->handle, large_position); + if (length <= 16) continue; //! This is the only difference from the serial solution + sz_cptr_t const str = large_sequence->get_start(large_sequence->handle, large_position); + sz_u64_t const hash = sz_hash(str, length, seed); + sz_size_t hash_slot = hash & (hash_table_slots - 1); + + // Implement linear probing to resolve collisions. + for (; table_hashes[hash_slot] != SZ_SIZE_MAX; hash_slot = (hash_slot + 1) & (hash_table_slots - 1)) { + sz_u64_t small_hash = table_hashes[hash_slot]; + if (small_hash != hash) continue; + + // The hash matches, compare the strings. + sz_size_t const small_position = table_positions[hash_slot]; + sz_size_t const small_length = small_sequence->get_length(small_sequence->handle, small_position); + if (length != small_length) continue; + + // Same hash may still imply different strings, so we need to compare them. + sz_cptr_t const small_str = small_sequence->get_start(small_sequence->handle, small_position); + sz_bool_t const same = sz_equal(str, small_str, length); + if (same != sz_true_k) continue; + + // Finally, there is a match, store the positions. + small_positions[intersection_count] = small_position; + large_positions[intersection_count] = large_position; + ++intersection_count; + break; + } + } + } + + // Finalize + *intersection_count_ptr = intersection_count; + return sz_success_k; +} + +#pragma clang attribute pop +#pragma GCC pop_options +#endif // SZ_USE_ICE +#pragma endregion // Ice Lake Implementation + +/* Pick the right implementation for the string search algorithms. 
+ * To override this behavior and precompile all backends - set `SZ_DYNAMIC_DISPATCH` to 1. + */ +#pragma region Compile Time Dispatching +#if !SZ_DYNAMIC_DISPATCH + +SZ_DYNAMIC sz_status_t sz_sequence_intersect(sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, + sz_memory_allocator_t *alloc, sz_u64_t seed, sz_size_t *intersection_size, + sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions) { +#if SZ_USE_SKYLAKE + return sz_sequence_intersect_ice( // + first_sequence, second_sequence, // + alloc, seed, intersection_size, // + first_positions, second_positions); +#elif SZ_USE_SVE + return sz_sequence_intersect_sve( // + first_sequence, second_sequence, // + alloc, seed, intersection_size, // + first_positions, second_positions); +#else + return sz_sequence_intersect_serial( // + first_sequence, second_sequence, // + alloc, seed, intersection_size, // + first_positions, second_positions); +#endif +} + +#endif // !SZ_DYNAMIC_DISPATCH +#pragma endregion // Compile Time Dispatching + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // STRINGZILLA_INTERSECT_H_ diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index 16ca31fe..af808cb5 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -1,15 +1,10 @@ /** - * @brief Hardware-accelerated string collection sorting & joins. + * @brief Hardware-accelerated string collection sorting. * @file sort.h * @author Ash Vardanian * - * Includes core APIs for `sz_sequence_t` string collections with hardware-specific backends: - * - * - `sz_sequence_argsort` - to get the sorting permutation of a string collection. - * - `sz_sequence_join` - to compute the intersection of two arbitrary string collections. - * - * The first can easily be used to implement SORT and GROUPBY operations SQL, while the second can be used to - * implement JOIN operations. Both are essential for implementing efficient database engines. + * Provides the @b `sz_sequence_argsort` API to get the sorting permutation of `sz_sequence_t` binary + * string collections in lexicographical order. * * The core idea of all following string algorithms is to process strings not based on 1 character at a time, * but on a larger "Pointer-sized N-grams" fitting in 4 or 8 bytes at once, on 32-bit or 64-bit architectures, @@ -17,7 +12,7 @@ * rest for some metadata. * * That, however, means, that unsigned integer sorting & matching is a constituent part of our sequence - * algorithms and we can expose them as an additional set of APIs for the users: + * algorithms and we can expose them as an additional APIs for the users: * * - `sz_pgrams_sort` - to inplace sort continuous pointer-sized integers. * - `sz_pgrams_join` - to compute the intersection of two arbitrary integer collections. @@ -116,94 +111,6 @@ SZ_DYNAMIC sz_status_t sz_sequence_argsort(sz_sequence_t const *sequence, sz_mem SZ_DYNAMIC sz_status_t sz_pgrams_sort(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); -/** - * @brief Intersects two arbitrary @b string sequences, using a hash table. - * Outputs the @p first_positions from the @p first_sequence and @p second_positions from - * the @p second_sequence, that contain identical strings. - * - * - * @param[in] first_sequence First immutable sequence of strings to intersection. - * @param[in] second_sequence Second immutable sequence of strings to intersection. - * @param[in] alloc Optional memory allocator for temporary storage. 
- * @param[out] intersection_size Number of identical strings in both sequences. - * @param[out] first_positions Offset positions of the identical strings from the @p first_sequence. - * @param[out] second_positions Offset positions of the identical strings from the @p second_sequence. - * - * @retval `sz_success_k` if the operation was successful. - * @retval `sz_bad_alloc_k` if the operation failed due to memory allocation failure. - * @retval `sz_contains_duplicates_k` if any of the sequences contain duplicate strings. - * @pre The @p first_positions arrays must fit at least `min(first_sequence->count, second_sequence->count)` items. - * @pre The @p second_positions arrays must fit at least `min(first_sequence->count, second_sequence->count)` items. - * - * Example usage: - * - * @code{.c} - * #include - * int main() { - * char const *first[] = {"banana", "apple", "cherry"}; - * char const *second[] = {"cherry", "orange", "pineapple", "banana"}; - * sz_sequence_t first_sequence, second_sequence; - * sz_sequence_from_null_terminated_strings(first, 3, &first_sequence); - * sz_sequence_from_null_terminated_strings(second, 4, &second_sequence); - * sz_size_t intersection_size; - * sz_sorted_idx_t first_positions[3], second_positions[3]; //? 3 is the size of the smaller sequence - * sz_status_t status = sz_sequence_join(&first_sequence, &second_sequence, NULL, - * &intersection_size, first_positions, second_positions); - * return status == sz_success_k && intersection_size == 2 ? 0 : 1; - * } - * @endcode - * - * @note The algorithm has linear memory complexity and linear time complexity. - * @see https://en.wikipedia.org/wiki/Join_(SQL) - * - * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. - * @sa sz_sequence_join_serial, sz_sequence_join_skylake, sz_sequence_join_sve - */ -SZ_DYNAMIC sz_status_t sz_sequence_join(sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, - sz_memory_allocator_t *alloc, sz_size_t *intersection_size, - sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions); - -/** - * @brief Faster @b inplace `std::stable_sort` for a continuous @b unsigned-integer sequence, using MergeSort. - * Overwrites the input @p pgrams with the sorted sequence and exports the @p order permutation. - * - * This algorithm guarantees stability, ensuring that the relative order of equal elements is preserved. - * It uses more memory than `sz_pgrams_sort`, but its performance is more predictable. - * It's preferred for very large inputs, as most memory access happens in a sequential pattern. - * - * @param[inout] pgrams Continuous buffer of unsigned integers to sort in place. - * @param[in] count Number of elements in the sequence. - * @param[in] alloc Optional memory allocator for temporary storage. - * @param[out] order Output permutation that sorts the elements. Must fit at least @p count integers. - * - * @retval `sz_success_k` if the operation was successful. - * @retval `sz_bad_alloc_k` if the operation failed due to memory allocation failure. - * @post The @p order array will contain a valid permutation of `[0, count - 1]`. - * - * Example usage: - * - * @code{.c} - * #include - * int main() { - * sz_pgram_t pgrams[] = {42, 17, 99, 8}; - * sz_sorted_idx_t order[4]; - * sz_pgrams_join(pgrams, 4, NULL, order); - * return order[0] == 3 && order[1] == 1 && order[2] == 0 && order[3] == 2 ? 0 : 1; - * } - * @endcode - * - * @note The algorithm has linear memory complexity and log-linear time complexity. 
- * @see [MergeSort Algorithm](https://en.wikipedia.org/wiki/Merge_sort) - * - * @note This algorithm is @b stable: equal elements maintain their relative order. - * @sa sz_pgrams_sort - * - * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. - * @sa sz_pgrams_join_serial, sz_pgrams_join_skylake, sz_pgrams_join_sve - */ -SZ_DYNAMIC sz_status_t sz_pgrams_join(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order); - /** @copydoc sz_sequence_argsort */ SZ_PUBLIC sz_status_t sz_sequence_argsort_serial(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); @@ -222,12 +129,6 @@ SZ_PUBLIC sz_status_t sz_sequence_argsort_skylake(sz_sequence_t const *sequence, SZ_PUBLIC sz_status_t sz_pgrams_sort_skylake(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); -/** @copydoc sz_sequence_join */ -SZ_PUBLIC sz_status_t sz_sequence_join_skylake( // - sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, // - sz_memory_allocator_t *alloc, sz_size_t *intersection_size, // - sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions); - #endif #if SZ_USE_SVE @@ -240,12 +141,6 @@ SZ_PUBLIC sz_status_t sz_sequence_argsort_sve(sz_sequence_t const *sequence, sz_ SZ_PUBLIC sz_status_t sz_pgrams_sort_sve(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, sz_sorted_idx_t *order); -/** @copydoc sz_sequence_join */ -SZ_PUBLIC sz_status_t sz_sequence_join_sve( // - sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, // - sz_memory_allocator_t *alloc, sz_size_t *intersection_size, // - sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions); - #endif #pragma endregion @@ -717,7 +612,7 @@ SZ_PUBLIC sz_status_t sz_pgrams_sort_serial(sz_pgram_t *pgrams, sz_size_t count, * @brief Helper function similar to `std::set_union` over pairs of integers and their original indices. * @see https://en.cppreference.com/w/cpp/algorithm/set_union */ -SZ_INTERNAL void _sz_sequence_join_serial_merge( // +SZ_INTERNAL void _sz_pgrams_union_serial( // sz_pgram_t const *first_pgrams, sz_sorted_idx_t const *first_indices, sz_size_t first_count, // sz_pgram_t const *second_pgrams, sz_sorted_idx_t const *second_indices, sz_size_t second_count, // sz_pgram_t *result_pgrams, sz_sorted_idx_t *result_indices) { @@ -764,167 +659,6 @@ SZ_INTERNAL void _sz_sequence_join_serial_merge( _sz_assert(merged_begin[i - 1] <= merged_begin[i] && "The merged pgrams must be in ascending order."); } -SZ_PUBLIC sz_status_t sz_pgrams_join_serial(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, - sz_sorted_idx_t *order) { - - // First, initialize the `order` with `std::iota`-like behavior. - for (sz_size_t i = 0; i != count; ++i) order[i] = i; - - // On very small collections - just use the quadratic-complexity insertion sort - // without any smart optimizations or memory allocations. - if (count <= 32) { - sz_pgrams_sort_with_insertion(pgrams, count, order); - return sz_success_k; - } - - // Go through short chunks of 8 elements and sort them with a sorting network. - for (sz_size_t i = 0; i + 8u <= count; i += 8u) _sz_sequence_sorting_network_8x(pgrams + i, order + i); - - // For the tail of the array, sort it with insertion sort. 
- sz_size_t const tail_count = count & 7u; - sz_pgrams_sort_with_insertion(pgrams + count - tail_count, tail_count, order + count - tail_count); - - // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome. - sz_memory_allocator_t global_alloc; - if (!alloc) { - sz_memory_allocator_init_default(&global_alloc); - alloc = &global_alloc; - } - - // At this point, the array is partitioned into sorted runs. - // We'll now merge these runs until the whole array is sorted. - // Allocate temporary memory to hold merged results: - // - one block for keys (`sz_pgram_t`) - // - one block for indices (`sz_sorted_idx_t`) - sz_size_t memory_usage = sizeof(sz_pgram_t) * count + sizeof(sz_sorted_idx_t) * count; - sz_pgram_t *pgrams_temporary = (sz_pgram_t *)alloc->allocate(memory_usage, alloc); - sz_sorted_idx_t *order_temporary = (sz_sorted_idx_t *)(pgrams_temporary + count); - if (!pgrams_temporary) return sz_bad_alloc_k; - - // Set initial run size (the sorted chunks). - sz_size_t run_size = 8; - - // Pointers for current source and destination arrays. - sz_pgram_t *src_pgrams = pgrams; - sz_sorted_idx_t *src_order = order; - sz_pgram_t *dst_pgrams = pgrams_temporary; - sz_sorted_idx_t *dst_order = order_temporary; - - // Merge sorted runs in a bottom-up manner until the run size covers the whole array. - while (run_size < count) { - // Process adjacent runs. - for (sz_size_t i = 0; i < count; i += run_size * 2) { - // Determine the number of elements in the left run. - sz_size_t left_count = run_size; - if (i + left_count > count) { left_count = count - i; } - - // Determine the number of elements in the right run. - sz_size_t right_count = run_size; - if (i + left_count >= count) { right_count = 0; } - else if (i + left_count + right_count > count) { right_count = count - (i + left_count); } - - // Merge the two runs: - _sz_sequence_join_serial_merge( // - src_pgrams + i, src_order + i, left_count, // - src_pgrams + i + run_size, src_order + i + run_size, right_count, // - dst_pgrams + i, dst_order + i); - } - - // Swap the roles of the source and destination arrays. - _sz_swap(sz_pgram_t *, src_pgrams, dst_pgrams); - _sz_swap(sz_sorted_idx_t *, src_order, dst_order); - - // Double the run size for the next pass. - run_size *= 2; - } - - // If the final sorted result is not in the original array, copy the sorted results back. - if (src_pgrams != pgrams) - for (sz_size_t i = 0; i < count; ++i) pgrams[i] = src_pgrams[i], order[i] = src_order[i]; - - // Free the temporary memory used for merging. - alloc->free(pgrams_temporary, memory_usage, alloc); - return sz_success_k; -} - -SZ_PUBLIC sz_status_t sz_sequence_join_serial( // - sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, // - sz_memory_allocator_t *alloc, sz_u64_t seed, sz_size_t *intersection_count_ptr, // - sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions) { - - // To join to unordered sets of strings, the simplest approach would be to hash them into a dynamically - // allocated hash table and then iterate over the second set, checking for the presence of each element in the - // hash table. This would require O(N) memory and O(N) time complexity, where N is the smaller set. 
- sz_sequence_t const *small_sequence, *large_sequence; - sz_sorted_idx_t *small_positions, *large_positions; - if (first_sequence->count <= second_sequence->count) { - small_sequence = first_sequence, large_sequence = second_sequence; - small_positions = first_positions, large_positions = second_positions; - } - else { - small_sequence = second_sequence, large_sequence = first_sequence; - small_positions = second_positions, large_positions = first_positions; - } - - // We may very well have nothing to join - if (small_sequence->count == 0) { - *intersection_count_ptr = 0; - return sz_success_k; - } - - // Allocate memory for the hash table and initialize it with 0xFF. - sz_size_t const hash_table_slots = sz_size_bit_ceil(small_sequence->count * 2); - sz_size_t const bytes_per_entry = sizeof(sz_size_t) + sizeof(sz_u64_t); - sz_size_t *table_positions = (sz_size_t *)alloc->allocate(hash_table_slots * bytes_per_entry, alloc); - if (!table_positions) return sz_bad_alloc_k; - sz_u64_t *table_fingerprints = (sz_u64_t *)(table_positions + hash_table_slots); - sz_fill((sz_ptr_t)table_positions, hash_table_slots * bytes_per_entry, 0xFF); - - // Hash the smaller set into the hash table using the default available backend. - for (sz_size_t small_position = 0; small_position < small_sequence->count; ++small_position) { - sz_cptr_t const str = small_sequence->get_start(small_sequence->handle, small_position); - sz_size_t const length = small_sequence->get_length(small_sequence->handle, small_position); - sz_u64_t const hash = sz_hash(str, length, seed); - sz_size_t hash_slot = hash; - // Implement linear probing to resolve collisions. - while (table_positions[hash_slot & (hash_table_slots - 1)] != SZ_SIZE_MAX) ++hash_slot; - table_positions[hash_slot & (hash_table_slots - 1)] = small_position; - table_fingerprints[hash_slot & (hash_table_slots - 1)] = hash; - } - - // Iterate over the larger set and check for the presence of each element in the hash table. - sz_size_t intersection_count = 0; - for (sz_size_t large_position = 0; large_position < large_sequence->count; ++large_position) { - sz_cptr_t const str = large_sequence->get_start(large_sequence->handle, large_position); - sz_size_t const length = large_sequence->get_length(large_sequence->handle, large_position); - sz_u64_t const hash = sz_hash(str, length, seed); - sz_size_t hash_slot = hash; - // Implement linear probing to resolve collisions. - for (; table_positions[hash_slot & (hash_table_slots - 1)] != SZ_SIZE_MAX; ++hash_slot) { - sz_u64_t small_hash = table_fingerprints[hash_slot & (hash_table_slots - 1)]; - if (small_hash != hash) continue; - - // The hash matches, compare the strings. - sz_size_t const small_position = table_positions[hash_slot & (hash_table_slots - 1)]; - sz_size_t const small_length = small_sequence->get_length(small_sequence->handle, small_position); - if (length != small_length) continue; - - sz_cptr_t const small_str = small_sequence->get_start(small_sequence->handle, small_position); - sz_bool_t const same = sz_equal(str, small_str, length); - if (same != sz_true_k) continue; - - // Finally, there is a match, store the positions. - small_positions[intersection_count] = small_position; - large_positions[intersection_count] = large_position; - ++intersection_count; - break; - } - } - - *intersection_count_ptr = intersection_count; - return sz_success_k; -} - #pragma endregion // Serial MergeSort Implementation /* AVX512 implementation of the string search algorithms for Ice Lake and newer CPUs. 
@@ -1186,14 +920,6 @@ SZ_PUBLIC sz_status_t sz_sequence_argsort_skylake(sz_sequence_t const *sequence, return sz_success_k; } -SZ_PUBLIC sz_status_t sz_sequence_join_skylake( // - sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, // - sz_memory_allocator_t *alloc, sz_size_t *intersection_size, // - sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions) { - sz_unused(first_sequence && second_sequence && alloc && intersection_size && first_positions && second_positions); - return sz_success_k; -} - #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_SKYLAKE @@ -1227,27 +953,6 @@ SZ_DYNAMIC sz_status_t sz_pgrams_sort(sz_pgram_t *pgrams, sz_size_t count, sz_me #endif } -SZ_DYNAMIC sz_status_t sz_sequence_join(sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, - sz_memory_allocator_t *alloc, sz_size_t *intersection_size, - sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions) { -#if SZ_USE_SKYLAKE - return sz_sequence_join_skylake( // - first_sequence, second_sequence, // - alloc, intersection_size, // - first_positions, second_positions); -#elif SZ_USE_SVE - return sz_sequence_join_sve( // - first_sequence, second_sequence, // - alloc, intersection_size, // - first_positions, second_positions); -#else - return sz_sequence_join_serial( // - first_sequence, second_sequence, // - alloc, intersection_size, // - first_positions, second_positions); -#endif -} - #endif // !SZ_DYNAMIC_DISPATCH #pragma endregion // Compile Time Dispatching diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index 824bacd4..682f63f0 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -48,7 +48,8 @@ #include "find.h" // `sz_find`, `sz_find_byteset`, `sz_rfind` #include "small_string.h" // `sz_string_t`, `sz_string_init`, `sz_string_free` #include "similarity.h" // `sz_levenshtein_distance`, `sz_needleman_wunsch_score` -#include "sort.h" // `sz_sequence_argsort`, `sz_pgrams_sort`, `sz_pgrams_sort_stable` +#include "sort.h" // `sz_sequence_argsort`, `sz_pgrams_sort` +#include "intersect.h" // `sz_sequence_intersect` #ifdef __cplusplus extern "C" { From e25f518b8ff21ad244923a8562ca0325d024fa66 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 7 Mar 2025 12:27:07 +0000 Subject: [PATCH 149/751] Import: Portable macros for C++ version inference --- include/stringzilla/stringzilla.hpp | 36 ++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index a1b2de28..eafa448f 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -28,12 +28,36 @@ /* We need to detect the version of the C++ language we are compiled with. * This will affect recent features like `operator<=>` and tests against STL. 
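 *
 * The flags defined below expand to a literal 0 or 1, so they can be tested with `#if`,
 * as done further down in this header around `operator<=>`. A purely illustrative,
 * non-normative sketch of the typical consumption pattern follows; the `MY_CONSTEXPR20`
 * macro name is a hypothetical example, not part of this header:
 *
 *     #if _SZ_IS_CPP20
 *     #define MY_CONSTEXPR20 constexpr
 *     #else
 *     #define MY_CONSTEXPR20
 *     #endif
 *
 *     MY_CONSTEXPR20 int answer() { return 42; }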
*/ -#define _SZ_IS_CPP23 (__cplusplus >= 202101L) -#define _SZ_IS_CPP20 (__cplusplus >= 202002L) -#define _SZ_IS_CPP17 (__cplusplus >= 201703L) -#define _SZ_IS_CPP14 (__cplusplus >= 201402L) -#define _SZ_IS_CPP11 (__cplusplus >= 201103L) -#define _SZ_IS_CPP98 (__cplusplus >= 199711L) +#if __cplusplus >= 202101L +#define _SZ_IS_CPP23 1 +#else +#define _SZ_IS_CPP23 0 +#endif +#if __cplusplus >= 202002L +#define _SZ_IS_CPP20 1 +#else +#define _SZ_IS_CPP20 0 +#endif +#if __cplusplus >= 201703L +#define _SZ_IS_CPP17 1 +#else +#define _SZ_IS_CPP17 0 +#endif +#if __cplusplus >= 201402L +#define _SZ_IS_CPP14 1 +#else +#define _SZ_IS_CPP14 0 +#endif +#if __cplusplus >= 201103L +#define _SZ_IS_CPP11 1 +#else +#define _SZ_IS_CPP11 0 +#endif +#if __cplusplus >= 199711L +#define _SZ_IS_CPP98 1 +#else +#define _SZ_IS_CPP98 0 +#endif /** * @brief Expands to `constexpr` in C++20 and later, and to nothing in older C++ versions. From 0d982a45f842287d7e344f0d8b360f52482017f5 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 7 Mar 2025 12:53:39 +0000 Subject: [PATCH 150/751] Docs: New formatting in C++ Refreshed the C++ doxygen docstring to match the style of C headers with the new `@sa`, `@p`, and `@retval` tags. --- include/stringzilla/stringzilla.hpp | 1102 +++++++++++++-------------- 1 file changed, 545 insertions(+), 557 deletions(-) diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index eafa448f..98817dd7 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -118,30 +118,33 @@ using carray = char[count_characters]; #pragma region Memory Operations /** - * @brief Analog to @b `std::memset`, but with a more efficient implementation. - * @param target The pointer to the target memory region. - * @param value The byte value to set. - * @param n The number of bytes to copy. + * @brief Analog to @b `std::memset`, but with a more efficient implementation. + * @param[in] target The pointer to the target memory region. + * @param[in] value The byte value to set. + * @param[in] n The number of bytes to copy. + * @see https://en.cppreference.com/w/cpp/string/byte/memset */ inline void memset(void *target, char value, std::size_t n) noexcept { return sz_fill(reinterpret_cast(target), n, value); } /** - * @brief Analog to @b `std::memmove`, but with a more efficient implementation. - * @param target The pointer to the target memory region. - * @param source The pointer to the source memory region. - * @param n The number of bytes to copy. + * @brief Analog to @b `std::memmove`, but with a more efficient implementation. + * @param[in] target The pointer to the target memory region. + * @param[in] source The pointer to the source memory region. + * @param[in] n The number of bytes to copy. + * @see https://en.cppreference.com/w/cpp/string/byte/memmove */ inline void memmove(void *target, void const *source, std::size_t n) noexcept { return sz_move(reinterpret_cast(target), reinterpret_cast(source), n); } /** - * @brief Analog to @b `std::memcpy`, but with a more efficient implementation. - * @param target The pointer to the target memory region. - * @param source The pointer to the source memory region. - * @param n The number of bytes to copy. + * @brief Analog to @b `std::memcpy`, but with a more efficient implementation. + * @param[in] target The pointer to the target memory region. + * @param[in] source The pointer to the source memory region. 
+ * @param[in] n The number of bytes to copy. + * @see https://en.cppreference.com/w/cpp/string/byte/memcpy */ inline void memcpy(void *target, void const *source, std::size_t n) noexcept { return sz_copy(reinterpret_cast(target), reinterpret_cast(source), n); @@ -152,8 +155,8 @@ inline void memcpy(void *target, void const *source, std::size_t n) noexcept { #pragma region Character Sets /** - * @brief The concatenation of the `ascii_lowercase` and `ascii_uppercase`. This value is not locale-dependent. - * https://docs.python.org/3/library/string.html#string.ascii_letters + * @brief The concatenation of the `ascii_lowercase` and `ascii_uppercase`. This value is not locale-dependent. + * @see https://docs.python.org/3/library/string.html#string.ascii_letters */ inline carray<52> const &ascii_letters() noexcept { static carray<52> const all = { @@ -166,8 +169,8 @@ inline carray<52> const &ascii_letters() noexcept { } /** - * @brief The lowercase letters "abcdefghijklmnopqrstuvwxyz". This value is not locale-dependent. - * https://docs.python.org/3/library/string.html#string.ascii_lowercase + * @brief The lowercase letters "abcdefghijklmnopqrstuvwxyz". This value is not locale-dependent. + * @see https://docs.python.org/3/library/string.html#string.ascii_lowercase */ inline carray<26> const &ascii_lowercase() noexcept { static carray<26> const all = { @@ -179,8 +182,8 @@ inline carray<26> const &ascii_lowercase() noexcept { } /** - * @brief The uppercase letters "ABCDEFGHIJKLMNOPQRSTUVWXYZ". This value is not locale-dependent. - * https://docs.python.org/3/library/string.html#string.ascii_uppercase + * @brief The uppercase letters "ABCDEFGHIJKLMNOPQRSTUVWXYZ". This value is not locale-dependent. + * @see https://docs.python.org/3/library/string.html#string.ascii_uppercase */ inline carray<26> const &ascii_uppercase() noexcept { static carray<26> const all = { @@ -192,9 +195,8 @@ inline carray<26> const &ascii_uppercase() noexcept { } /** - * @brief ASCII characters which are considered printable. - * A combination of `digits`, `ascii_letters`, `punctuation`, and `whitespace`. - * https://docs.python.org/3/library/string.html#string.printable + * @brief Printable ASCII characters, including: `digits`, `ascii_letters`, `punctuation`, and `whitespace`. + * @see https://docs.python.org/3/library/string.html#string.printable */ inline carray<100> const &ascii_printables() noexcept { static carray<100> const all = { @@ -209,8 +211,7 @@ inline carray<100> const &ascii_printables() noexcept { } /** - * @brief Non-printable ASCII control characters. - * Includes all codes from 0 to 31 and 127. + * @brief Non-printable ASCII control characters. Includes all codes from 0 to 31 and 127. */ inline carray<33> const &ascii_controls() noexcept { static carray<33> const all = { @@ -222,8 +223,8 @@ inline carray<33> const &ascii_controls() noexcept { } /** - * @brief The digits "0123456789". - * https://docs.python.org/3/library/string.html#string.digits + * @brief The digits "0123456789". + * @see https://docs.python.org/3/library/string.html#string.digits */ inline carray<10> const &digits() noexcept { static carray<10> const all = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}; @@ -231,8 +232,8 @@ inline carray<10> const &digits() noexcept { } /** - * @brief The letters "0123456789abcdefABCDEF". - * https://docs.python.org/3/library/string.html#string.hexdigits + * @brief The letters "0123456789abcdefABCDEF". 
+ * @see https://docs.python.org/3/library/string.html#string.hexdigits */ inline carray<22> const &hexdigits() noexcept { static carray<22> const all = { @@ -244,8 +245,8 @@ inline carray<22> const &hexdigits() noexcept { } /** - * @brief The letters "01234567". - * https://docs.python.org/3/library/string.html#string.octdigits + * @brief The letters "01234567". + * @see https://docs.python.org/3/library/string.html#string.octdigits */ inline carray<8> const &octdigits() noexcept { static carray<8> const all = {'0', '1', '2', '3', '4', '5', '6', '7'}; @@ -253,9 +254,8 @@ inline carray<8> const &octdigits() noexcept { } /** - * @brief ASCII characters considered punctuation characters in the C locale: - * !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~. - * https://docs.python.org/3/library/string.html#string.punctuation + * @brief ASCII characters considered punctuation characters in the C locale: @b !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~. + * @see https://docs.python.org/3/library/string.html#string.punctuation */ inline carray<32> const &punctuation() noexcept { static carray<32> const all = { @@ -267,9 +267,8 @@ inline carray<32> const &punctuation() noexcept { } /** - * @brief ASCII characters that are considered whitespace. - * This includes space, tab, linefeed, return, formfeed, and vertical tab. - * https://docs.python.org/3/library/string.html#string.whitespace + * @brief Whitespace ASCII characters, including: space, tab, linefeed, return, formfeed, and vertical tab. + * @see https://docs.python.org/3/library/string.html#string.whitespace */ inline carray<6> const &whitespaces() noexcept { static carray<6> const all = {' ', '\t', '\n', '\r', '\f', '\v'}; @@ -277,8 +276,8 @@ inline carray<6> const &whitespaces() noexcept { } /** - * @brief ASCII characters that are considered line delimiters. - * https://docs.python.org/3/library/stdtypes.html#str.splitlines + * @brief ASCII characters that are considered line delimiters. + * @see https://docs.python.org/3/library/stdtypes.html#str.splitlines */ inline carray<8> const &newlines() noexcept { static carray<8> const all = {'\n', '\r', '\f', '\v', '\x1C', '\x1D', '\x1E', '\x85'}; @@ -286,7 +285,8 @@ inline carray<8> const &newlines() noexcept { } /** - * @brief ASCII characters forming the BASE64 encoding alphabet. + * @brief ASCII characters forming the BASE64 encoding alphabet: a-z, A-Z, 0-9, +, and /. + * @see https://docs.python.org/3/library/base64.html */ inline carray<64> const &base64() noexcept { static carray<64> const all = { @@ -299,7 +299,7 @@ inline carray<64> const &base64() noexcept { } /** - * @brief A set of characters represented as a bitset with 256 slots. + * @brief A set of characters represented as a bitset with 256 slots. */ template class basic_byteset { @@ -383,9 +383,8 @@ inline byteset newlines_set() { return byteset {newlines(), sizeof(newlines())}; inline byteset base64_set() { return byteset {base64(), sizeof(base64())}; } /** - * @brief A look-up table for character replacement operations. - * Exactly 256 bytes for byte-to-byte replacement. - * ! For larger character types should be allocated on the heap. + * @brief A look-up table for character replacement operations. Exactly 256 bytes for byte-to-byte replacement. + * @warning For larger character types should be allocated on the heap. */ template class basic_look_up_table { @@ -415,8 +414,8 @@ class basic_look_up_table { } /** - * @brief Creates a look-up table with a one-to-one mapping of characters to themselves. 
- * Similar to `std::iota` filling, but properly handles signed integer casts. + * @brief Creates a look-up table with a one-to-one mapping of characters to themselves. + * @see Similar to `std::iota` filling, but properly handles signed integer casts. */ static basic_look_up_table identity() noexcept { basic_look_up_table result; @@ -446,7 +445,8 @@ inline static constexpr exclude_overlaps_type exclude_overlaps; #endif /** - * @brief Zero-cost wrapper around the `.find` member function of string-like classes. + * @brief Zero-cost wrapper around the `.find` member function of string-like classes. + * @see https://en.cppreference.com/w/cpp/string/basic_string/find */ template struct matcher_find { @@ -463,7 +463,8 @@ struct matcher_find { }; /** - * @brief Zero-cost wrapper around the `.rfind` member function of string-like classes. + * @brief Zero-cost wrapper around the `.rfind` member function of string-like classes. + * @see https://en.cppreference.com/w/cpp/string/basic_string/rfind */ template struct matcher_rfind { @@ -480,7 +481,8 @@ struct matcher_rfind { }; /** - * @brief Zero-cost wrapper around the `.find_first_of` member function of string-like classes. + * @brief Zero-cost wrapper around the `.find_first_of` member function of string-like classes. + * @see https://en.cppreference.com/w/cpp/string/basic_string/find_first_of */ template struct matcher_find_first_of { @@ -492,7 +494,8 @@ struct matcher_find_first_of { }; /** - * @brief Zero-cost wrapper around the `.find_last_of` member function of string-like classes. + * @brief Zero-cost wrapper around the `.find_last_of` member function of string-like classes. + * @see https://en.cppreference.com/w/cpp/string/basic_string/find_last_of */ template struct matcher_find_last_of { @@ -504,7 +507,8 @@ struct matcher_find_last_of { }; /** - * @brief Zero-cost wrapper around the `.find_first_not_of` member function of string-like classes. + * @brief Zero-cost wrapper around the `.find_first_not_of` member function of string-like classes. + * @see https://en.cppreference.com/w/cpp/string/basic_string/find_first_not_of */ template struct matcher_find_first_not_of { @@ -516,7 +520,8 @@ struct matcher_find_first_not_of { }; /** - * @brief Zero-cost wrapper around the `.find_last_not_of` member function of string-like classes. + * @brief Zero-cost wrapper around the `.find_last_not_of` member function of string-like classes. + * @see https://en.cppreference.com/w/cpp/string/basic_string/find_last_not_of */ template struct matcher_find_last_not_of { @@ -528,9 +533,9 @@ struct matcher_find_last_not_of { }; /** - * @brief A range of string slices representing the matches of a substring search. - * Compatible with C++23 ranges, C++11 string views, and of course, StringZilla. - * Similar to a pair of `boost::algorithm::find_iterator`. + * @brief A range of string slices representing the matches of a substring search. + * @note Compatible with C++23 ranges, C++11 string views, and of course, StringZilla. + * @see Similar to a pair of `boost::algorithm::find_iterator`. */ template class range_matches { @@ -597,17 +602,13 @@ class range_matches { bool empty() const noexcept { return begin() == end_sentinel_type {}; } bool include_overlaps() const noexcept { return matcher_.skip_length() < matcher_.needle_length(); } - /** - * @brief Copies the matches into a container. - */ + /** @brief Copies the matches into a container. 
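     *  For example, `find_all(text, ", ").to(parts)` appends every match to an existing `parts` container.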
*/ template void to(container_ &container) { - for (auto match : *this) { container.push_back(match); } + for (auto match : *this) container.push_back(match); } - /** - * @brief Copies the matches into a consumed container, returning it at the end. - */ + /** @brief Copies the matches into a consumed container, returning it at the end. */ template container_ to() { return container_ {begin(), end()}; @@ -615,9 +616,9 @@ class range_matches { }; /** - * @brief A range of string slices representing the matches of a @b reverse-order substring search. - * Compatible with C++23 ranges, C++11 string views, and of course, StringZilla. - * Similar to a pair of `boost::algorithm::find_iterator`. + * @brief A range of string slices representing the matches of a @b reverse-order substring search. + * @note Compatible with C++23 ranges, C++11 string views, and of course, StringZilla. + * @see Similar to a pair of `boost::algorithm::find_iterator`. */ template class range_rmatches { @@ -695,17 +696,13 @@ class range_rmatches { bool empty() const noexcept { return begin() == end_sentinel_type {}; } bool include_overlaps() const noexcept { return matcher_.skip_length() < matcher_.needle_length(); } - /** - * @brief Copies the matches into a container. - */ + /** @brief Copies the matches into a container. */ template void to(container_ &container) { - for (auto match : *this) { container.push_back(match); } + for (auto match : *this) container.push_back(match); } - /** - * @brief Copies the matches into a consumed container, returning it at the end. - */ + /** @brief Copies the matches into a consumed container, returning it at the end. */ template container_ to() { return container_ {begin(), end()}; @@ -713,9 +710,9 @@ class range_rmatches { }; /** - * @brief A range of string slices for different splits of the data. - * Compatible with C++23 ranges, C++11 string views, and of course, StringZilla. - * Similar to a pair of `boost::algorithm::split_iterator`. + * @brief A range of string slices for different splits of the data. + * @note Compatible with C++23 ranges, C++11 string views, and of course, StringZilla. + * @see Similar to a pair of `boost::algorithm::split_iterator`. * * In some sense, represents the inverse operation to `range_matches`, as it reports not the search matches * but the data between them. Meaning that for `N` search matches, there will be `N+1` elements in the range. @@ -797,28 +794,24 @@ class range_splits { difference_type ssize() const noexcept { return std::distance(begin(), end()); } constexpr bool empty() const noexcept { return false; } - /** - * @brief Copies the matches into a container. - */ + /** @brief Copies the matches into a container. */ template void to(container_ &container) { - for (auto match : *this) { container.push_back(match); } + for (auto match : *this) container.push_back(match); } - /** - * @brief Copies the matches into a consumed container, returning it at the end. - */ + /** @brief Copies the matches into a consumed container, returning it at the end. */ template container_ to(container_ &&container = {}) { - for (auto match : *this) { container.push_back(match); } + for (auto match : *this) container.push_back(match); return std::move(container); } }; /** - * @brief A range of string slices for different splits of the data in @b reverse-order. - * Compatible with C++23 ranges, C++11 string views, and of course, StringZilla. - * Similar to a pair of `boost::algorithm::split_iterator`. 
+ * @brief A range of string slices for different splits of the data in @b reverse-order. + * @note Compatible with C++23 ranges, C++11 string views, and of course, StringZilla. + * @see Similar to a pair of `boost::algorithm::split_iterator`. * * In some sense, represents the inverse operation to `range_matches`, as it reports not the search matches * but the data between them. Meaning that for `N` search matches, there will be `N+1` elements in the range. @@ -906,27 +899,23 @@ class range_rsplits { difference_type ssize() const noexcept { return std::distance(begin(), end()); } constexpr bool empty() const noexcept { return false; } - /** - * @brief Copies the matches into a container. - */ + /** @brief Copies the matches into a container. */ template void to(container_ &container) { - for (auto match : *this) { container.push_back(match); } + for (auto match : *this) container.push_back(match); } - /** - * @brief Copies the matches into a consumed container, returning it at the end. - */ + /** @brief Copies the matches into a consumed container, returning it at the end. */ template container_ to(container_ &&container = {}) { - for (auto match : *this) { container.push_back(match); } + for (auto match : *this) container.push_back(match); return std::move(container); } }; /** - * @brief Find all potentially @b overlapping inclusions of a needle substring. - * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. + * @brief Find all potentially @b overlapping inclusions of a needle substring. + * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. */ template range_matches> find_all(string const &h, string const &n, @@ -935,8 +924,8 @@ range_matches> find_all(stri } /** - * @brief Find all potentially @b overlapping inclusions of a needle substring in @b reverse order. - * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. + * @brief Find all potentially @b overlapping inclusions of a needle substring in @b reverse order. + * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. */ template range_rmatches> rfind_all(string const &h, string const &n, @@ -945,8 +934,8 @@ range_rmatches> rfind_all(s } /** - * @brief Find all @b non-overlapping inclusions of a needle substring. - * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. + * @brief Find all @b non-overlapping inclusions of a needle substring. + * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. */ template range_matches> find_all(string const &h, string const &n, @@ -955,8 +944,8 @@ range_matches> find_all(stri } /** - * @brief Find all @b non-overlapping inclusions of a needle substring in @b reverse order. - * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. + * @brief Find all @b non-overlapping inclusions of a needle substring in @b reverse order. + * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. */ template range_rmatches> rfind_all(string const &h, string const &n, @@ -965,8 +954,8 @@ range_rmatches> rfind_all(s } /** - * @brief Find all inclusions of characters from the second string. - * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. + * @brief Find all inclusions of characters from the second string. 
+ * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. */ template range_matches> find_all_characters(string const &h, string const &n) noexcept { @@ -974,8 +963,8 @@ range_matches> find_all_characters(string } /** - * @brief Find all inclusions of characters from the second string in @b reverse order. - * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. + * @brief Find all inclusions of characters from the second string in @b reverse order. + * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. */ template range_rmatches> rfind_all_characters(string const &h, string const &n) noexcept { @@ -983,8 +972,8 @@ range_rmatches> rfind_all_characters(string } /** - * @brief Find all characters except the ones in the second string. - * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. + * @brief Find all characters except the ones in the second string. + * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. */ template range_matches> find_all_other_characters(string const &h, @@ -993,8 +982,8 @@ range_matches> find_all_other_characte } /** - * @brief Find all characters except the ones in the second string in @b reverse order. - * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. + * @brief Find all characters except the ones in the second string in @b reverse order. + * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. */ template range_rmatches> rfind_all_other_characters(string const &h, @@ -1003,8 +992,8 @@ range_rmatches> rfind_all_other_charact } /** - * @brief Splits a string around every @b non-overlapping inclusion of the second string. - * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. + * @brief Splits a string around every @b non-overlapping inclusion of the second string. + * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. */ template range_splits> split(string const &h, string const &n) noexcept { @@ -1012,8 +1001,8 @@ range_splits> split(string c } /** - * @brief Splits a string around every @b non-overlapping inclusion of the second string in @b reverse order. - * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. + * @brief Splits a string around every @b non-overlapping inclusion of the second string in @b reverse order. + * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. */ template range_rsplits> rsplit(string const &h, string const &n) noexcept { @@ -1021,8 +1010,8 @@ range_rsplits> rsplit(strin } /** - * @brief Splits a string around every character from the second string. - * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. + * @brief Splits a string around every character from the second string. + * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. */ template range_splits> split_characters(string const &h, string const &n) noexcept { @@ -1030,8 +1019,8 @@ range_splits> split_characters(string cons } /** - * @brief Splits a string around every character from the second string in @b reverse order. - * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. 
+ * @brief Splits a string around every character from the second string in @b reverse order. + * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. */ template range_rsplits> rsplit_characters(string const &h, string const &n) noexcept { @@ -1039,8 +1028,8 @@ range_rsplits> rsplit_characters(string con } /** - * @brief Splits a string around every character except the ones from the second string. - * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. + * @brief Splits a string around every character except the ones from the second string. + * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. */ template range_splits> split_other_characters(string const &h, @@ -1049,8 +1038,8 @@ range_splits> split_other_characters(s } /** - * @brief Splits a string around every character except the ones from the second string in @b reverse order. - * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. + * @brief Splits a string around every character except the ones from the second string in @b reverse order. + * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`. */ template range_rsplits> rsplit_other_characters(string const &h, @@ -1058,14 +1047,14 @@ range_rsplits> rsplit_other_characters( return {h, n}; } -/** @brief Helper function using `std::advance` iterator and return it back. */ +/** @brief Helper function using `std::advance` iterator and return it back. */ template iterator_type advanced(iterator_type &&it, distance_type n) { std::advance(it, n); return it; } -/** @brief Helper function using `range_length` to compute the unsigned distance. */ +/** @brief Helper function using `range_length` to compute the unsigned distance. */ template std::size_t range_length(iterator_type first, iterator_type last) { return static_cast(std::distance(first, last)); @@ -1172,7 +1161,9 @@ class reversed_iterator_for { }; /** - * @brief An "expression template" for lazy concatenation of strings using the `operator|`. + * @brief An "expression template" for lazy concatenation of strings using the `operator|`. + * @see https://en.wikipedia.org/wiki/Expression_templates + * @sa `concatenate` function for usage examples. */ template struct concatenation { @@ -1218,7 +1209,7 @@ struct concatenation { * with much faster SIMD-accelerated substring search and approximate matching. * Constructors are `constexpr` enabling `_sz` literals. * - * @tparam char_type_ The character type, usually `char const` or `char`. Must be a single byte long. + * @tparam char_type_ The character type, usually `char const` or `char`. Must be a single byte long. */ template class basic_string_slice { @@ -1254,7 +1245,7 @@ class basic_string_slice { using string_view = basic_string_slice; using partition_type = string_partition_result; - /** @brief Special value for missing matches. + /** @brief Special value for missing matches. * * We take the largest 63-bit unsigned integer on 64-bit machines. * We take the largest 31-bit unsigned integer on 32-bit machines. @@ -1298,8 +1289,8 @@ class basic_string_slice { operator std::string() const { return {data(), size()}; } /** - * @brief Formatted output function for compatibility with STL's `std::basic_ostream`. - * @throw `std::ios_base::failure` if an exception occurred during output. + * @brief Formatted output function for compatibility with STL's `std::basic_ostream`. 
+ * @throw `std::ios_base::failure` if an exception occurred during output. */ template friend std::basic_ostream &operator<<(std::basic_ostream &os, @@ -1364,7 +1355,7 @@ class basic_string_slice { } /** - * @brief Signed alternative to `at()`. Handy if you often write `str[str.size() - 2]`. + * @brief Signed alternative to `at()`. Handy if you often write `str[str.size() - 2]`. * @warning The behavior is @b undefined if the position is beyond bounds. */ reference sat(difference_type signed_offset) const noexcept { @@ -1376,6 +1367,7 @@ class basic_string_slice { /** * @brief The slice that would be dropped by `remove_prefix`, that accepts signed arguments * and does no bounds checking. Equivalent to Python's `"abc"[:2]` and `"abc"[:-1]`. + * * @warning The behavior is @b undefined if `n > size() || n < -size() || n == -0`. */ string_slice front(difference_type signed_offset) const noexcept { @@ -1420,49 +1412,49 @@ class basic_string_slice { #pragma region STL Style /** - * @brief Removes the first `n` characters from the view. + * @brief Removes the first @p `n` bytes from the view. * @warning The behavior is @b undefined if `n > size()`. */ void remove_prefix(size_type n) noexcept { assert(n <= size()), start_ += n, length_ -= n; } /** - * @brief Removes the last `n` characters from the view. + * @brief Removes the last @p `n` bytes from the view. * @warning The behavior is @b undefined if `n > size()`. */ void remove_suffix(size_type n) noexcept { assert(n <= size()), length_ -= n; } - /** @brief Added for STL compatibility. */ + /** @brief Added for STL compatibility. */ string_slice substr() const noexcept { return *this; } /** - * @brief Return a slice of this view after first `skip` bytes. - * @throws `std::out_of_range` if `skip > size()`. - * @see `sub` for a cleaner exception-less alternative. + * @brief Return a slice of this view after first @p `n` bytes. + * @throws `std::out_of_range` if `n > size()`. + * @sa `sub` for a cleaner exception-less alternative. */ - string_slice substr(size_type skip) const noexcept(false) { - if (skip > size()) throw std::out_of_range("string_slice::substr"); - return string_slice(start_ + skip, length_ - skip); + string_slice substr(size_type n) const noexcept(false) { + if (n > size()) throw std::out_of_range("string_slice::substr"); + return string_slice(start_ + n, length_ - n); } /** - * @brief Return a slice of this view after first `skip` bytes, taking at most `count` bytes. - * @throws `std::out_of_range` if `skip > size()`. - * @see `sub` for a cleaner exception-less alternative. + * @brief Return a slice of this view after first @p `n` bytes, taking at most `count` bytes. + * @throws `std::out_of_range` if `n > size()`. + * @sa `sub` for a cleaner exception-less alternative. */ - string_slice substr(size_type skip, size_type count) const noexcept(false) { - if (skip > size()) throw std::out_of_range("string_slice::substr"); - return string_slice(start_ + skip, sz_min_of_two(count, length_ - skip)); + string_slice substr(size_type n, size_type count) const noexcept(false) { + if (n > size()) throw std::out_of_range("string_slice::substr"); + return string_slice(start_ + n, sz_min_of_two(count, length_ - n)); } /** - * @brief Exports a slice of this view after first `skip` bytes, taking at most `count` bytes. - * @throws `std::out_of_range` if `skip > size()`. - * @see `sub` for a cleaner exception-less alternative. + * @brief Exports a slice of this view after first @p `n` bytes, taking at most `count` bytes. 
+ * @throws `std::out_of_range` if `n > size()`. + * @sa `sub` for a cleaner exception-less alternative. */ - size_type copy(value_type *destination, size_type count, size_type skip = 0) const noexcept(false) { - if (skip > size()) throw std::out_of_range("string_slice::copy"); - count = sz_min_of_two(count, length_ - skip); - sz_copy((sz_ptr_t)destination, start_ + skip, count); + size_type copy(value_type *destination, size_type count, size_type n = 0) const noexcept(false) { + if (n > size()) throw std::out_of_range("string_slice::copy"); + count = sz_min_of_two(count, length_ - n); + sz_copy((sz_ptr_t)destination, start_ + n, count); return count; } @@ -1475,26 +1467,26 @@ class basic_string_slice { #pragma region Whole String Comparisons /** - * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. + * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`. */ int compare(string_view other) const noexcept { return (int)sz_order(data(), size(), other.data(), other.size()); } /** - * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. - * Equivalent to `substr(pos1, count1).compare(other)`. + * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. + * @see Equivalent to `substr(pos1, count1).compare(other)`. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`. - * @throw `std::out_of_range` if `pos1 > size()`. + * @throw `std::out_of_range` if `pos1 > size()`. */ int compare(size_type pos1, size_type count1, string_view other) const noexcept(false) { return substr(pos1, count1).compare(other); } /** - * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. - * Equivalent to `substr(pos1, count1).compare(other.substr(pos2, count2))`. + * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. + * @see Equivalent to `substr(pos1, count1).compare(other.substr(pos2, count2))`. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`. - * @throw `std::out_of_range` if `pos1 > size()` or if `pos2 > other.size()`. + * @throw `std::out_of_range` if `pos1 > size()` or if `pos2 > other.size()`. */ int compare(size_type pos1, size_type count1, string_view other, size_type pos2, size_type count2) const noexcept(false) { @@ -1502,37 +1494,37 @@ class basic_string_slice { } /** - * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. + * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`. */ int compare(const_pointer other) const noexcept { return compare(string_view(other)); } /** - * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. - * Equivalent to substr(pos1, count1).compare(other). + * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. + * @see Equivalent to substr(pos1, count1).compare(other). * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`. - * @throw `std::out_of_range` if `pos1 > size()`. + * @throw `std::out_of_range` if `pos1 > size()`. 
*/ int compare(size_type pos1, size_type count1, const_pointer other) const noexcept(false) { return substr(pos1, count1).compare(string_view(other)); } /** - * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. - * Equivalent to `substr(pos1, count1).compare({s, count2})`. + * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. + * @see Equivalent to `substr(pos1, count1).compare({s, count2})`. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`. - * @throw `std::out_of_range` if `pos1 > size()`. + * @throw `std::out_of_range` if `pos1 > size()`. */ int compare(size_type pos1, size_type count1, const_pointer other, size_type count2) const noexcept(false) { return substr(pos1, count1).compare(string_view(other, count2)); } - /** @brief Checks if the string is equal to the other string. */ + /** @brief Checks if the string is equal to the other string. */ bool operator==(string_view other) const noexcept { return size() == other.size() && sz_equal(data(), other.data(), other.size()) == sz_true_k; } - /** @brief Checks if the string is equal to a concatenation of two strings. */ + /** @brief Checks if the string is equal to a concatenation of two strings. */ bool operator==(concatenation const &other) const noexcept { return size() == other.size() && sz_equal(data(), other.first.data(), other.first.size()) == sz_true_k && sz_equal(data() + other.first.size(), other.second.data(), other.second.size()) == sz_true_k; @@ -1540,7 +1532,7 @@ class basic_string_slice { #if _SZ_IS_CPP20 - /** @brief Computes the lexicographic ordering between this and the ::other string. */ + /** @brief Computes the lexicographic ordering between this and the ::other string. */ std::strong_ordering operator<=>(string_view other) const noexcept { std::strong_ordering orders[3] {std::strong_ordering::less, std::strong_ordering::equal, std::strong_ordering::greater}; @@ -1549,19 +1541,19 @@ class basic_string_slice { #else - /** @brief Checks if the string is not equal to the other string. */ + /** @brief Checks if the string is not equal to the other string. */ bool operator!=(string_view other) const noexcept { return !operator==(other); } - /** @brief Checks if the string is lexicographically smaller than the other string. */ + /** @brief Checks if the string is lexicographically smaller than the other string. */ bool operator<(string_view other) const noexcept { return compare(other) == sz_less_k; } - /** @brief Checks if the string is lexicographically equal or smaller than the other string. */ + /** @brief Checks if the string is lexicographically equal or smaller than the other string. */ bool operator<=(string_view other) const noexcept { return compare(other) != sz_greater_k; } - /** @brief Checks if the string is lexicographically greater than the other string. */ + /** @brief Checks if the string is lexicographically greater than the other string. */ bool operator>(string_view other) const noexcept { return compare(other) == sz_greater_k; } - /** @brief Checks if the string is lexicographically equal or greater than the other string. */ + /** @brief Checks if the string is lexicographically equal or greater than the other string. 
*/ bool operator>=(string_view other) const noexcept { return compare(other) != sz_less_k; } #endif @@ -1569,41 +1561,41 @@ class basic_string_slice { #pragma endregion #pragma region Prefix and Suffix Comparisons - /** @brief Checks if the string starts with the other string. */ + /** @brief Checks if the string starts with the other string. */ bool starts_with(string_view other) const noexcept { return length_ >= other.size() && sz_equal(start_, other.data(), other.size()) == sz_true_k; } - /** @brief Checks if the string starts with the other string. */ + /** @brief Checks if the string starts with the other string. */ bool starts_with(const_pointer other) const noexcept { auto other_length = null_terminated_length(other); return length_ >= other_length && sz_equal(start_, other, other_length) == sz_true_k; } - /** @brief Checks if the string starts with the other character. */ + /** @brief Checks if the string starts with the other character. */ bool starts_with(value_type other) const noexcept { return length_ && start_[0] == other; } - /** @brief Checks if the string ends with the other string. */ + /** @brief Checks if the string ends with the other string. */ bool ends_with(string_view other) const noexcept { return length_ >= other.size() && sz_equal(start_ + length_ - other.size(), other.data(), other.size()) == sz_true_k; } - /** @brief Checks if the string ends with the other string. */ + /** @brief Checks if the string ends with the other string. */ bool ends_with(const_pointer other) const noexcept { auto other_length = null_terminated_length(other); return length_ >= other_length && sz_equal(start_ + length_ - other_length, other, other_length) == sz_true_k; } - /** @brief Checks if the string ends with the other character. */ + /** @brief Checks if the string ends with the other character. */ bool ends_with(value_type other) const noexcept { return length_ && start_[length_ - 1] == other; } - /** @brief Python-like convenience function, dropping the matching prefix. */ + /** @brief Python-like convenience function, dropping the matching prefix. */ string_slice remove_prefix(string_view other) const noexcept { return starts_with(other) ? string_slice {start_ + other.size(), length_ - other.size()} : *this; } - /** @brief Python-like convenience function, dropping the matching suffix. */ + /** @brief Python-like convenience function, dropping the matching suffix. */ string_slice remove_suffix(string_view other) const noexcept { return ends_with(other) ? string_slice {start_, length_ - other.size()} : *this; } @@ -1620,9 +1612,9 @@ class basic_string_slice { #pragma region Returning offsets /** - * @brief Find the first occurrence of a substring, skipping the first `skip` characters. - * The behavior is @b undefined if `skip > size()`. + * @brief Find the first occurrence of a substring, skipping the first `skip` characters. * @return The offset of the first character of the match, or `npos` if not found. + * @warning The behavior is @b undefined if `skip > size()`. */ size_type find(string_view other, size_type skip = 0) const noexcept { auto ptr = sz_find(start_ + skip, length_ - skip, other.data(), other.size()); @@ -1630,9 +1622,9 @@ class basic_string_slice { } /** - * @brief Find the first occurrence of a character, skipping the first `skip` characters. - * The behavior is @b undefined if `skip > size()`. + * @brief Find the first occurrence of a character, skipping the first `skip` characters. * @return The offset of the match, or `npos` if not found. 
+ * @warning The behavior is @b undefined if `skip > size()`. */ size_type find(value_type character, size_type skip = 0) const noexcept { auto ptr = sz_find_byte(start_ + skip, length_ - skip, &character); @@ -1640,16 +1632,16 @@ class basic_string_slice { } /** - * @brief Find the first occurrence of a substring, skipping the first `skip` characters. - * The behavior is @b undefined if `skip > size()`. + * @brief Find the first occurrence of a substring, skipping the first `skip` characters. * @return The offset of the first character of the match, or `npos` if not found. + * @warning The behavior is @b undefined if `skip > size()`. */ size_type find(const_pointer other, size_type pos, size_type count) const noexcept { return find(string_view(other, count), pos); } /** - * @brief Find the last occurrence of a substring. + * @brief Find the last occurrence of a substring. * @return The offset of the first character of the match, or `npos` if not found. */ size_type rfind(string_view other) const noexcept { @@ -1658,7 +1650,7 @@ class basic_string_slice { } /** - * @brief Find the last occurrence of a substring, within first `until` characters. + * @brief Find the last occurrence of a substring, within first `until` characters. * @return The offset of the first character of the match, or `npos` if not found. */ size_type rfind(string_view other, size_type until) const noexcept(false) { @@ -1666,7 +1658,7 @@ class basic_string_slice { } /** - * @brief Find the last occurrence of a character. + * @brief Find the last occurrence of a character. * @return The offset of the match, or `npos` if not found. */ size_type rfind(value_type character) const noexcept { @@ -1675,7 +1667,7 @@ class basic_string_slice { } /** - * @brief Find the last occurrence of a character, within first `until` characters. + * @brief Find the last occurrence of a character, within first `until` characters. * @return The offset of the match, or `npos` if not found. */ size_type rfind(value_type character, size_type until) const noexcept { @@ -1683,38 +1675,38 @@ class basic_string_slice { } /** - * @brief Find the last occurrence of a substring, within first `until` characters. + * @brief Find the last occurrence of a substring, within first `until` characters. * @return The offset of the first character of the match, or `npos` if not found. */ size_type rfind(const_pointer other, size_type until, size_type count) const noexcept { return rfind(string_view(other, count), until); } - /** @brief Find the first occurrence of a character from a set. */ + /** @brief Find the first occurrence of a character from a set. */ size_type find(byteset set) const noexcept { return find_first_of(set); } - /** @brief Find the last occurrence of a character from a set. */ + /** @brief Find the last occurrence of a character from a set. */ size_type rfind(byteset set) const noexcept { return find_last_of(set); } #pragma endregion #pragma region Returning Partitions - /** @brief Split the string into three parts, before the match, the match itself, and after it. */ + /** @brief Split the string into three parts, before the match, the match itself, and after it. */ partition_type partition(string_view pattern) const noexcept { return partition_(pattern, pattern.length()); } - /** @brief Split the string into three parts, before the match, the match itself, and after it. */ + /** @brief Split the string into three parts, before the match, the match itself, and after it. 
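     *  For example, partitioning "key=value" on '=' yields the three slices "key", "=", and "value".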
*/ partition_type partition(value_type pattern) const noexcept { return partition_(string_view(&pattern, 1), 1); } - /** @brief Split the string into three parts, before the match, the match itself, and after it. */ + /** @brief Split the string into three parts, before the match, the match itself, and after it. */ partition_type partition(byteset pattern) const noexcept { return partition_(pattern, 1); } - /** @brief Split the string into three parts, before the @b last match, the last match itself, and after it. */ + /** @brief Split the string into three parts, before the @b last match, the last match itself, and after it. */ partition_type rpartition(string_view pattern) const noexcept { return rpartition_(pattern, pattern.length()); } - /** @brief Split the string into three parts, before the @b last match, the last match itself, and after it. */ + /** @brief Split the string into three parts, before the @b last match, the last match itself, and after it. */ partition_type rpartition(value_type pattern) const noexcept { return rpartition_(string_view(&pattern, 1), 1); } - /** @brief Split the string into three parts, before the @b last match, the last match itself, and after it. */ + /** @brief Split the string into three parts, before the @b last match, the last match itself, and after it. */ partition_type rpartition(byteset pattern) const noexcept { return rpartition_(pattern, 1); } #pragma endregion @@ -1735,8 +1727,8 @@ class basic_string_slice { #pragma region Character Set Arguments /** - * @brief Find the first occurrence of a character from a set. - * @param skip Number of characters to skip before the search. + * @brief Find the first occurrence of a character from a @p `set`. + * @param[in] skip Number of characters to skip before the search. * @warning The behavior is @b undefined if `skip > size()`. */ size_type find_first_of(byteset set, size_type skip = 0) const noexcept { @@ -1745,8 +1737,8 @@ class basic_string_slice { } /** - * @brief Find the first occurrence of a character outside a set. - * @param skip The number of first characters to be skipped. + * @brief Find the first occurrence of a character outside a @p `set`. + * @param[in] skip The number of first characters to be skipped. * @warning The behavior is @b undefined if `skip > size()`. */ size_type find_first_not_of(byteset set, size_type skip = 0) const noexcept { @@ -1754,7 +1746,7 @@ class basic_string_slice { } /** - * @brief Find the last occurrence of a character from a set. + * @brief Find the last occurrence of a character from a @p `set`. */ size_type find_last_of(byteset set) const noexcept { auto ptr = sz_rfind_byteset(start_, length_, &set.raw()); @@ -1762,13 +1754,13 @@ class basic_string_slice { } /** - * @brief Find the last occurrence of a character outside a set. + * @brief Find the last occurrence of a character outside a @p `set`. */ size_type find_last_not_of(byteset set) const noexcept { return find_last_of(set.inverted()); } /** - * @brief Find the last occurrence of a character from a set. - * @param until The offset of the last character to be considered. + * @brief Find the last occurrence of a character from a @p `set`. + * @param[in] until The offset of the last character to be considered. */ size_type find_last_of(byteset set, size_type until) const noexcept { auto len = sz_min_of_two(until + 1, length_); @@ -1777,8 +1769,8 @@ class basic_string_slice { } /** - * @brief Find the last occurrence of a character outside a set. 
- * @param until The offset of the last character to be considered. + * @brief Find the last occurrence of a character outside a @p `set`. + * @param[in] until The offset of the last character to be considered. */ size_type find_last_not_of(byteset set, size_type until) const noexcept { return find_last_of(set.inverted(), until); @@ -1788,32 +1780,32 @@ class basic_string_slice { #pragma region String Arguments /** - * @brief Find the first occurrence of a character from a ::set. - * @param skip The number of first characters to be skipped. + * @brief Find the first occurrence of a character from the @p `other` string. + * @param[in] skip The number of first characters to be skipped. */ size_type find_first_of(string_view other, size_type skip = 0) const noexcept { return find_first_of(other.as_set(), skip); } /** - * @brief Find the first occurrence of a character outside a ::set. - * @param skip The number of first characters to be skipped. + * @brief Find the first occurrence of a character missing in the @p `other` string. + * @param[in] skip The number of first characters to be skipped. */ size_type find_first_not_of(string_view other, size_type skip = 0) const noexcept { return find_first_not_of(other.as_set(), skip); } /** - * @brief Find the last occurrence of a character from a ::set. - * @param until The offset of the last character to be considered. + * @brief Find the last occurrence of a character from the @p `other` string. + * @param[in] until The offset of the last character to be considered. */ size_type find_last_of(string_view other, size_type until = npos) const noexcept { return find_last_of(other.as_set(), until); } /** - * @brief Find the last occurrence of a character outside a ::set. - * @param until The offset of the last character to be considered. + * @brief Find the last occurrence of a character missing in the @p `other` string. + * @param[in] until The offset of the last character to be considered. */ size_type find_last_not_of(string_view other, size_type until = npos) const noexcept { return find_last_not_of(other.as_set(), until); @@ -1823,8 +1815,8 @@ class basic_string_slice { #pragma region C Style Arguments /** - * @brief Find the first occurrence of a character from a set. - * @param skip The number of first characters to be skipped. + * @brief Find the first occurrence of a character from the @p `other` string. + * @param[in] skip The number of first characters to be skipped. * @warning The behavior is @b undefined if `skip > size()`. */ size_type find_first_of(const_pointer other, size_type skip, size_type count) const noexcept { @@ -1832,8 +1824,8 @@ class basic_string_slice { } /** - * @brief Find the first occurrence of a character outside a set. - * @param skip The number of first characters to be skipped. + * @brief Find the first occurrence of a character missing in the @p `other` string. + * @param[in] skip The number of first characters to be skipped. * @warning The behavior is @b undefined if `skip > size()`. */ size_type find_first_not_of(const_pointer other, size_type skip, size_type count) const noexcept { @@ -1841,16 +1833,16 @@ class basic_string_slice { } /** - * @brief Find the last occurrence of a character from a set. - * @param until The number of first characters to be considered. + * @brief Find the last occurrence of a character from the @p `other` string. + * @param[in] until The number of first characters to be considered. 
*/ size_type find_last_of(const_pointer other, size_type until, size_type count) const noexcept { return find_last_of(string_view(other, count), until); } /** - * @brief Find the last occurrence of a character outside a set. - * @param until The number of first characters to be considered. + * @brief Find the last occurrence of a character missing in the @p `other` string. + * @param[in] until The number of first characters to be considered. */ size_type find_last_not_of(const_pointer other, size_type until, size_type count) const noexcept { return find_last_not_of(string_view(other, count), until); @@ -1860,8 +1852,8 @@ class basic_string_slice { #pragma region Slicing /** - * @brief Python-like convenience function, dropping prefix formed of given characters. - * Similar to `boost::algorithm::trim_left_if(str, is_any_of(set))`. + * @brief Python-like convenience function, dropping prefix formed of given characters. + * @see Similar to `boost::algorithm::trim_left_if(str, is_any_of(set))`. */ string_slice lstrip(byteset set) const noexcept { set = set.inverted(); @@ -1871,8 +1863,8 @@ class basic_string_slice { } /** - * @brief Python-like convenience function, dropping suffix formed of given characters. - * Similar to `boost::algorithm::trim_right_if(str, is_any_of(set))`. + * @brief Python-like convenience function, dropping suffix formed of given characters. + * @see Similar to `boost::algorithm::trim_right_if(str, is_any_of(set))`. */ string_slice rstrip(byteset set) const noexcept { set = set.inverted(); @@ -1881,8 +1873,8 @@ class basic_string_slice { } /** - * @brief Python-like convenience function, dropping both the prefix & the suffix formed of given characters. - * Similar to `boost::algorithm::trim_if(str, is_any_of(set))`. + * @brief Python-like convenience function, dropping both the prefix & the suffix formed of given characters. + * @see Similar to `boost::algorithm::trim_if(str, is_any_of(set))`. */ string_slice strip(byteset set) const noexcept { set = set.inverted(); @@ -1908,22 +1900,22 @@ class basic_string_slice { using find_all_chars_type = range_matches>; using rfind_all_chars_type = range_rmatches>; - /** @brief Find all potentially @b overlapping occurrences of a given string. */ + /** @brief Find all potentially @b overlapping occurrences of a given string. */ find_all_type find_all(string_view needle, include_overlaps_type = {}) const noexcept { return {*this, needle}; } - /** @brief Find all potentially @b overlapping occurrences of a given string in @b reverse order. */ + /** @brief Find all potentially @b overlapping occurrences of a given string in @b reverse order. */ rfind_all_type rfind_all(string_view needle, include_overlaps_type = {}) const noexcept { return {*this, needle}; } - /** @brief Find all @b non-overlapping occurrences of a given string. */ + /** @brief Find all @b non-overlapping occurrences of a given string. */ find_disjoint_type find_all(string_view needle, exclude_overlaps_type) const noexcept { return {*this, needle}; } - /** @brief Find all @b non-overlapping occurrences of a given string in @b reverse order. */ + /** @brief Find all @b non-overlapping occurrences of a given string in @b reverse order. */ rfind_disjoint_type rfind_all(string_view needle, exclude_overlaps_type) const noexcept { return {*this, needle}; } - /** @brief Find all occurrences of given characters. */ + /** @brief Find all occurrences of given characters. 
*/ find_all_chars_type find_all(byteset set) const noexcept { return {*this, {set}}; } - /** @brief Find all occurrences of given characters in @b reverse order. */ + /** @brief Find all occurrences of given characters in @b reverse order. */ rfind_all_chars_type rfind_all(byteset set) const noexcept { return {*this, {set}}; } using split_type = range_splits>; @@ -1932,32 +1924,32 @@ class basic_string_slice { using split_chars_type = range_splits>; using rsplit_chars_type = range_rsplits>; - /** @brief Split around occurrences of a given string. */ + /** @brief Split around occurrences of a given string. */ split_type split(string_view delimiter) const noexcept { return {*this, delimiter}; } - /** @brief Split around occurrences of a given string in @b reverse order. */ + /** @brief Split around occurrences of a given string in @b reverse order. */ rsplit_type rsplit(string_view delimiter) const noexcept { return {*this, delimiter}; } - /** @brief Split around occurrences of given characters. */ + /** @brief Split around occurrences of given characters. */ split_chars_type split(byteset set = whitespaces_set()) const noexcept { return {*this, {set}}; } - /** @brief Split around occurrences of given characters in @b reverse order. */ + /** @brief Split around occurrences of given characters in @b reverse order. */ rsplit_chars_type rsplit(byteset set = whitespaces_set()) const noexcept { return {*this, {set}}; } - /** @brief Split around the occurrences of all newline characters. */ + /** @brief Split around the occurrences of all newline characters. */ split_chars_type splitlines() const noexcept { return split(newlines_set()); } #pragma endregion - /** @brief Hashes the string, equivalent to `std::hash{}(str)`. */ + /** @brief Hashes the string, equivalent to `std::hash{}(str)`. */ size_type hash(std::uint64_t seed = 42) const noexcept { return static_cast(sz_hash(start_, length_, static_cast(seed))); } - /** @brief Aggregates the values of individual bytes of a string. */ + /** @brief Aggregates the values of individual bytes of a string. */ size_type bytesum() const noexcept { return static_cast(sz_bytesum(start_, length_)); } - /** @brief Populate a character set with characters present in this string. */ + /** @brief Populate a character set with characters present in this string. */ byteset as_set() const noexcept { byteset set; for (auto c : *this) set.add(c); @@ -2157,7 +2149,7 @@ class basic_string { basic_string(std::nullptr_t) = delete; - /** @brief Construct a string by repeating a certain ::character ::count times. */ + /** @brief Construct a string by repeating a certain @p character @p count times. */ basic_string(size_type count, value_type character) noexcept(false) { init(count, character); } basic_string(basic_string const &other, size_type pos) noexcept(false) { init(string_view(other).substr(pos)); } @@ -2210,8 +2202,8 @@ class basic_string { operator std::string() const { return view(); } /** - * @brief Formatted output function for compatibility with STL's `std::basic_ostream`. - * @throw `std::ios_base::failure` if an exception occurred during output. + * @brief Formatted output function for compatibility with STL's `std::basic_ostream`. + * @throw `std::ios_base::failure` if an exception occurred during output. 
*/ template friend std::basic_ostream &operator<<(std::basic_ostream &os, @@ -2231,12 +2223,12 @@ class basic_string { template explicit basic_string(concatenation const &expression) noexcept(false) { - _with_alloc([&](sz_alloc_type &alloc) { + raise(_with_alloc([&](sz_alloc_type &alloc) { sz_ptr_t ptr = sz_string_init_length(&string_, expression.length(), &alloc); - if (!ptr) return false; + if (!ptr) return sz_bad_alloc_k; expression.copy(ptr); - return true; - }); + return sz_success_k; + })); } template @@ -2318,21 +2310,21 @@ class basic_string { string_span operator[](std::initializer_list offsets) noexcept { return span()[offsets]; } /** - * @brief Signed alternative to `at()`. Handy if you often write `str[str.size() - 2]`. + * @brief Signed alternative to `at()`. Handy if you often write `str[str.size() - 2]`. * @warning The behavior is @b undefined if the position is beyond bounds. */ value_type sat(difference_type offset) const noexcept { return view().sat(offset); } reference sat(difference_type offset) noexcept { return span().sat(offset); } /** - * @brief The opposite operation to `remove_prefix`, that does no bounds checking. + * @brief The opposite operation to `remove_prefix`, that does no bounds checking. * @warning The behavior is @b undefined if `n > size()`. */ string_view front(difference_type n) const noexcept { return view().front(n); } string_span front(difference_type n) noexcept { return span().front(n); } /** - * @brief The opposite operation to `remove_prefix`, that does no bounds checking. + * @brief The opposite operation to `remove_prefix`, that does no bounds checking. * @warning The behavior is @b undefined if `n > size()`. */ string_view back(difference_type n) const noexcept { return view().back(n); } @@ -2356,7 +2348,7 @@ class basic_string { #pragma region STL Style /** - * @brief Removes the first `n` characters from the view. + * @brief Removes the first `n` characters from the view. * @warning The behavior is @b undefined if `n > size()`. */ void remove_prefix(size_type n) noexcept { @@ -2365,7 +2357,7 @@ class basic_string { } /** - * @brief Removes the last `n` characters from the view. + * @brief Removes the last `n` characters from the view. * @warning The behavior is @b undefined if `n > size()`. */ void remove_suffix(size_type n) noexcept { @@ -2373,27 +2365,27 @@ class basic_string { sz_string_erase(&string_, size() - n, n); } - /** @brief Added for STL compatibility. */ + /** @brief Added for STL compatibility. */ basic_string substr() const noexcept { return *this; } /** - * @brief Return a slice of this view after first `skip` bytes. + * @brief Return a slice of this view after first `skip` bytes. * @throws `std::out_of_range` if `skip > size()`. - * @see `sub` for a cleaner exception-less alternative. + * @sa `sub` for a cleaner exception-less alternative. */ basic_string substr(size_type skip) const noexcept(false) { return view().substr(skip); } /** - * @brief Return a slice of this view after first `skip` bytes, taking at most `count` bytes. + * @brief Return a slice of this view after first `skip` bytes, taking at most `count` bytes. * @throws `std::out_of_range` if `skip > size()`. - * @see `sub` for a cleaner exception-less alternative. + * @sa `sub` for a cleaner exception-less alternative. */ basic_string substr(size_type skip, size_type count) const noexcept(false) { return view().substr(skip, count); } /** - * @brief Exports a slice of this view after first `skip` bytes, taking at most `count` bytes. 
+ * @brief Exports a slice of this view after first `skip` bytes, taking at most `count` bytes. * @throws `std::out_of_range` if `skip > size()`. - * @see `sub` for a cleaner exception-less alternative. + * @sa `sub` for a cleaner exception-less alternative. */ size_type copy(value_type *destination, size_type count, size_type skip = 0) const noexcept(false) { return view().copy(destination, count, skip); @@ -2408,26 +2400,26 @@ class basic_string { #pragma region Whole String Comparisons /** - * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. + * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`. */ int compare(string_view other) const noexcept { return view().compare(other); } /** - * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. - * Equivalent to `substr(pos1, count1).compare(other)`. + * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`. - * @throw `std::out_of_range` if `pos1 > size()`. + * @throw `std::out_of_range` if `pos1 > size()`. + * @sa Equivalent to `substr(pos1, count1).compare(other)`. */ int compare(size_type pos1, size_type count1, string_view other) const noexcept(false) { return view().compare(pos1, count1, other); } /** - * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. - * Equivalent to `substr(pos1, count1).compare(other.substr(pos2, count2))`. - * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`. - * @throw `std::out_of_range` if `pos1 > size()` or if `pos2 > other.size()`. + * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. + * @return 0 if equal, negative if `*this` is less than @p other, positive if `*this` is greater than @p other. + * @throw `std::out_of_range` if `pos1 > size()` or if `pos2 > other.size()`. + * @sa Equivalent to `substr(pos1, count1).compare(other.substr(pos2, count2))`. */ int compare(size_type pos1, size_type count1, string_view other, size_type pos2, size_type count2) const noexcept(false) { @@ -2435,58 +2427,58 @@ class basic_string { } /** - * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. - * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`. + * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. + * @return 0 if equal, negative if `*this` is less than @p other, positive if `*this` is greater than @p other. */ int compare(const_pointer other) const noexcept { return view().compare(other); } /** - * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. - * Equivalent to substr(pos1, count1).compare(other). - * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`. - * @throw `std::out_of_range` if `pos1 > size()`. + * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. + * @return 0 if equal, negative if `*this` is less than @p other, positive if `*this` is greater than @p other. + * @throw `std::out_of_range` if `pos1 > size()`. 
+ * @sa Equivalent to `substr(pos1, count1).compare(other)`. */ int compare(size_type pos1, size_type count1, const_pointer other) const noexcept(false) { return view().compare(pos1, count1, other); } /** - * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. - * Equivalent to `substr(pos1, count1).compare({s, count2})`. - * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`. - * @throw `std::out_of_range` if `pos1 > size()`. + * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. + * @return 0 if equal, negative if `*this` is less than @p other, positive if `*this` is greater than @p other. + * @throw `std::out_of_range` if `pos1 > size()`. + * @sa Equivalent to `substr(pos1, count1).compare({s, count2})`. */ int compare(size_type pos1, size_type count1, const_pointer other, size_type count2) const noexcept(false) { return view().compare(pos1, count1, other, count2); } - /** @brief Checks if the string is equal to the other string. */ + /** @brief Checks if the string is equal to the other string. */ bool operator==(basic_string const &other) const noexcept { return view() == other.view(); } bool operator==(string_view other) const noexcept { return view() == other; } bool operator==(const_pointer other) const noexcept { return view() == string_view(other); } #if _SZ_IS_CPP20 - /** @brief Computes the lexicographic ordering between this and the ::other string. */ + /** @brief Computes the lexicographic ordering between this and the @p other string. */ std::strong_ordering operator<=>(basic_string const &other) const noexcept { return view() <=> other.view(); } std::strong_ordering operator<=>(string_view other) const noexcept { return view() <=> other; } std::strong_ordering operator<=>(const_pointer other) const noexcept { return view() <=> string_view(other); } #else - /** @brief Checks if the string is not equal to the other string. */ + /** @brief Checks if the string is not equal to the other string. */ bool operator!=(string_view other) const noexcept { return !operator==(other); } - /** @brief Checks if the string is lexicographically smaller than the other string. */ + /** @brief Checks if the string is lexicographically smaller than the other string. */ bool operator<(string_view other) const noexcept { return compare(other) == sz_less_k; } - /** @brief Checks if the string is lexicographically equal or smaller than the other string. */ + /** @brief Checks if the string is lexicographically equal or smaller than the other string. */ bool operator<=(string_view other) const noexcept { return compare(other) != sz_greater_k; } - /** @brief Checks if the string is lexicographically greater than the other string. */ + /** @brief Checks if the string is lexicographically greater than the other string. */ bool operator>(string_view other) const noexcept { return compare(other) == sz_greater_k; } - /** @brief Checks if the string is lexicographically equal or greater than the other string. */ + /** @brief Checks if the string is lexicographically equal or greater than the other string. */ bool operator>=(string_view other) const noexcept { return compare(other) != sz_less_k; } #endif @@ -2494,22 +2486,22 @@ class basic_string { #pragma endregion #pragma region Prefix and Suffix Comparisons - /** @brief Checks if the string starts with the other string. */ + /** @brief Checks if the string starts with the other string. 
*/ bool starts_with(string_view other) const noexcept { return view().starts_with(other); } - /** @brief Checks if the string starts with the other string. */ + /** @brief Checks if the string starts with the other string. */ bool starts_with(const_pointer other) const noexcept { return view().starts_with(other); } - /** @brief Checks if the string starts with the other character. */ + /** @brief Checks if the string starts with the other character. */ bool starts_with(value_type other) const noexcept { return view().starts_with(other); } - /** @brief Checks if the string ends with the other string. */ + /** @brief Checks if the string ends with the other string. */ bool ends_with(string_view other) const noexcept { return view().ends_with(other); } - /** @brief Checks if the string ends with the other string. */ + /** @brief Checks if the string ends with the other string. */ bool ends_with(const_pointer other) const noexcept { return view().ends_with(other); } - /** @brief Checks if the string ends with the other character. */ + /** @brief Checks if the string ends with the other character. */ bool ends_with(value_type other) const noexcept { return view().ends_with(other); } #pragma endregion @@ -2524,64 +2516,64 @@ class basic_string { #pragma region Returning offsets /** - * @brief Find the first occurrence of a substring, skipping the first `skip` characters. - * The behavior is @b undefined if `skip > size()`. + * @brief Find the first occurrence of a substring, skipping the first `skip` characters. * @return The offset of the first character of the match, or `npos` if not found. + * @warning The behavior is @b undefined if `skip > size()`. */ size_type find(string_view other, size_type skip = 0) const noexcept { return view().find(other, skip); } /** - * @brief Find the first occurrence of a character, skipping the first `skip` characters. - * The behavior is @b undefined if `skip > size()`. + * @brief Find the first occurrence of a character, skipping the first `skip` characters. * @return The offset of the match, or `npos` if not found. + * @warning The behavior is @b undefined if `skip > size()`. */ size_type find(value_type character, size_type skip = 0) const noexcept { return view().find(character, skip); } /** - * @brief Find the first occurrence of a substring, skipping the first `skip` characters. - * The behavior is @b undefined if `skip > size()`. + * @brief Find the first occurrence of a substring, skipping the first `skip` characters. * @return The offset of the first character of the match, or `npos` if not found. + * @warning The behavior is @b undefined if `skip > size()`. */ size_type find(const_pointer other, size_type pos, size_type count) const noexcept { return view().find(other, pos, count); } /** - * @brief Find the last occurrence of a substring. + * @brief Find the last occurrence of a substring. * @return The offset of the first character of the match, or `npos` if not found. */ size_type rfind(string_view other) const noexcept { return view().rfind(other); } /** - * @brief Find the last occurrence of a substring, within first `until` characters. + * @brief Find the last occurrence of a substring, within first `until` characters. * @return The offset of the first character of the match, or `npos` if not found. */ size_type rfind(string_view other, size_type until) const noexcept { return view().rfind(other, until); } /** - * @brief Find the last occurrence of a character. + * @brief Find the last occurrence of a character. 
* @return The offset of the match, or `npos` if not found. */ size_type rfind(value_type character) const noexcept { return view().rfind(character); } /** - * @brief Find the last occurrence of a character, within first `until` characters. + * @brief Find the last occurrence of a character, within first `until` characters. * @return The offset of the match, or `npos` if not found. */ size_type rfind(value_type character, size_type until) const noexcept { return view().rfind(character, until); } /** - * @brief Find the last occurrence of a substring, within first `until` characters. + * @brief Find the last occurrence of a substring, within first `until` characters. * @return The offset of the first character of the match, or `npos` if not found. */ size_type rfind(const_pointer other, size_type until, size_type count) const noexcept { return view().rfind(other, until, count); } - /** @brief Find the first occurrence of a character from a set. */ + /** @brief Find the first occurrence of a character from a set. */ size_type find(byteset set) const noexcept { return view().find(set); } - /** @brief Find the last occurrence of a character from a set. */ + /** @brief Find the last occurrence of a character from a set. */ size_type rfind(byteset set) const noexcept { return view().rfind(set); } #pragma endregion @@ -2603,40 +2595,36 @@ class basic_string { #pragma region Character Set Arguments /** - * @brief Find the first occurrence of a character from a set. - * @param skip Number of characters to skip before the search. + * @brief Find the first occurrence of a character from a @p `set`. + * @param[in] skip Number of characters to skip before the search. * @warning The behavior is @b undefined if `skip > size()`. */ size_type find_first_of(byteset set, size_type skip = 0) const noexcept { return view().find_first_of(set, skip); } /** - * @brief Find the first occurrence of a character outside a set. - * @param skip The number of first characters to be skipped. + * @brief Find the first occurrence of a character outside a @p `set`. + * @param[in] skip The number of first characters to be skipped. * @warning The behavior is @b undefined if `skip > size()`. */ size_type find_first_not_of(byteset set, size_type skip = 0) const noexcept { return view().find_first_not_of(set, skip); } - /** - * @brief Find the last occurrence of a character from a set. - */ + /** @brief Find the last occurrence of a character from a @p `set`. */ size_type find_last_of(byteset set) const noexcept { return view().find_last_of(set); } - /** - * @brief Find the last occurrence of a character outside a set. - */ + /** @brief Find the last occurrence of a character outside a @p `set`. */ size_type find_last_not_of(byteset set) const noexcept { return view().find_last_not_of(set); } /** - * @brief Find the last occurrence of a character from a set. - * @param until The offset of the last character to be considered. + * @brief Find the last occurrence of a character from a @p `set`. + * @param[in] until The offset of the last character to be considered. */ size_type find_last_of(byteset set, size_type until) const noexcept { return view().find_last_of(set, until); } /** - * @brief Find the last occurrence of a character outside a set. - * @param until The offset of the last character to be considered. + * @brief Find the last occurrence of a character outside a @p `set`. + * @param[in] until The offset of the last character to be considered. 
*/ size_type find_last_not_of(byteset set, size_type until) const noexcept { return view().find_last_not_of(set, until); @@ -2646,32 +2634,32 @@ class basic_string { #pragma region String Arguments /** - * @brief Find the first occurrence of a character from a ::set. - * @param skip The number of first characters to be skipped. + * @brief Find the first occurrence of a character from the @p `other` string. + * @param[in] skip The number of first characters to be skipped. */ size_type find_first_of(string_view other, size_type skip = 0) const noexcept { return view().find_first_of(other, skip); } /** - * @brief Find the first occurrence of a character outside a ::set. - * @param skip The number of first characters to be skipped. + * @brief Find the first occurrence of a character outside the @p `other` string. + * @param[in] skip The number of first characters to be skipped. */ size_type find_first_not_of(string_view other, size_type skip = 0) const noexcept { return view().find_first_not_of(other, skip); } /** - * @brief Find the last occurrence of a character from a ::set. - * @param until The offset of the last character to be considered. + * @brief Find the last occurrence of a character from the @p `other` string. + * @param[in] until The offset of the last character to be considered. */ size_type find_last_of(string_view other, size_type until = npos) const noexcept { return view().find_last_of(other, until); } /** - * @brief Find the last occurrence of a character outside a ::set. - * @param until The offset of the last character to be considered. + * @brief Find the last occurrence of a character outside the @p `other` string. + * @param[in] until The offset of the last character to be considered. */ size_type find_last_not_of(string_view other, size_type until = npos) const noexcept { return view().find_last_not_of(other, until); @@ -2681,8 +2669,8 @@ class basic_string { #pragma region C Style Arguments /** - * @brief Find the first occurrence of a character from a set. - * @param skip The number of first characters to be skipped. + * @brief Find the first occurrence of a character from a set. + * @param[in] skip The number of first characters to be skipped. * @warning The behavior is @b undefined if `skip > size()`. */ size_type find_first_of(const_pointer other, size_type skip, size_type count) const noexcept { @@ -2690,8 +2678,8 @@ class basic_string { } /** - * @brief Find the first occurrence of a character outside a set. - * @param skip The number of first characters to be skipped. + * @brief Find the first occurrence of a character outside a set. + * @param[in] skip The number of first characters to be skipped. * @warning The behavior is @b undefined if `skip > size()`. */ size_type find_first_not_of(const_pointer other, size_type skip, size_type count) const noexcept { @@ -2699,16 +2687,16 @@ class basic_string { } /** - * @brief Find the last occurrence of a character from a set. - * @param until The number of first characters to be considered. + * @brief Find the last occurrence of a character from a set. + * @param[in] until The number of first characters to be considered. */ size_type find_last_of(const_pointer other, size_type until, size_type count) const noexcept { return view().find_last_of(other, until, count); } /** - * @brief Find the last occurrence of a character outside a set. - * @param until The number of first characters to be considered. + * @brief Find the last occurrence of a character outside a set. 
+ * @param[in] until The number of first characters to be considered. */ size_type find_last_not_of(const_pointer other, size_type until, size_type count) const noexcept { return view().find_last_not_of(other, until, count); @@ -2718,8 +2706,8 @@ class basic_string { #pragma region Slicing /** - * @brief Python-like convenience function, dropping prefix formed of given characters. - * Similar to `boost::algorithm::trim_left_if(str, is_any_of(set))`. + * @brief Python-like convenience function, dropping prefix formed of given characters. + * @see Similar to `boost::algorithm::trim_left_if(str, is_any_of(set))`. */ basic_string &lstrip(byteset set) noexcept { auto remaining = view().lstrip(set); @@ -2729,7 +2717,7 @@ class basic_string { /** * @brief Python-like convenience function, dropping suffix formed of given characters. - * Similar to `boost::algorithm::trim_right_if(str, is_any_of(set))`. + * @see Similar to `boost::algorithm::trim_right_if(str, is_any_of(set))`. */ basic_string &rstrip(byteset set) noexcept { auto remaining = view().rstrip(set); @@ -2738,8 +2726,8 @@ class basic_string { } /** - * @brief Python-like convenience function, dropping both the prefix & the suffix formed of given characters. - * Similar to `boost::algorithm::trim_if(str, is_any_of(set))`. + * @brief Python-like convenience function, dropping both the prefix & the suffix formed of given characters. + * @see Similar to `boost::algorithm::trim_if(str, is_any_of(set))`. */ basic_string &strip(byteset set) noexcept { return lstrip(set).rstrip(set); } @@ -2750,15 +2738,15 @@ class basic_string { #pragma region Non STL API /** - * @brief Resizes the string to a specified number of characters, padding with the specified character if needed. - * @param count The new size of the string. - * @param character The character to fill new elements with, if expanding. Defaults to null character. + * @brief Resizes the string to a specified number of characters, padding with the specified character if needed. + * @param[in] count The new size of the string. + * @param[in] character The character to fill new elements with, if expanding. Defaults to null character. * @return `true` if the resizing was successful, `false` otherwise. */ bool try_resize(size_type count, value_type character = '\0') noexcept; /** - * @brief Attempts to reduce memory usage by freeing unused memory. + * @brief Attempts to reduce memory usage by freeing unused memory. * @return `true` if the operation was successful and potentially reduced the memory footprint, `false` otherwise. */ bool try_shrink_to_fit() noexcept { @@ -2766,8 +2754,8 @@ class basic_string { } /** - * @brief Attempts to reserve enough space for a specified number of characters. - * @param capacity The new capacity to reserve. + * @brief Attempts to reserve enough space for a specified number of characters. + * @param[in] capacity The new capacity to reserve. * @return `true` if the reservation was successful, `false` otherwise. */ bool try_reserve(size_type capacity) noexcept { @@ -2775,44 +2763,44 @@ class basic_string { } /** - * @brief Assigns a new value to the string, replacing its current contents. - * @param other The string view whose contents to assign. + * @brief Assigns a new value to the string, replacing its current contents. + * @param[in] other The string view whose contents to assign. * @return `true` if the assignment was successful, `false` otherwise. 
*/ bool try_assign(string_view other) noexcept; /** - * @brief Assigns a concatenated sequence to the string, replacing its current contents. - * @param other The concatenation object representing the sequence to assign. + * @brief Assigns a concatenated sequence to the string, replacing its current contents. + * @param[in] other The concatenation object representing the sequence to assign. * @return `true` if the assignment was successful, `false` otherwise. */ template bool try_assign(concatenation const &other) noexcept; /** - * @brief Attempts to add a single character to the end of the string. - * @param c The character to add. + * @brief Attempts to add a single character to the end of the string. + * @param[in] c The character to add. * @return `true` if the character was successfully added, `false` otherwise. */ bool try_push_back(char_type c) noexcept; /** - * @brief Attempts to append a given character array to the string. - * @param str The pointer to the array of characters to append. - * @param length The number of characters to append. + * @brief Attempts to append a given character array to the string. + * @param[in] str The pointer to the array of characters to append. + * @param[in] length The number of characters to append. * @return `true` if the append operation was successful, `false` otherwise. */ bool try_append(const_pointer str, size_type length) noexcept; /** - * @brief Attempts to append a string view to the string. - * @param str The string view to append. + * @brief Attempts to append a string view to the string. + * @param[in] str The string view to append. * @return `true` if the append operation was successful, `false` otherwise. */ bool try_append(string_view str) noexcept { return try_append(str.data(), str.size()); } /** - * @brief Clears the contents of the string and resets its length to 0. + * @brief Clears the contents of the string and resets its length to 0. * @return Always returns `true` as this operation cannot fail under normal conditions. */ bool try_clear() noexcept { @@ -2821,7 +2809,7 @@ class basic_string { } /** - * @brief Erases @b (in-place) a range of characters defined with signed offsets. + * @brief Erases @b (in-place) a range of characters defined with signed offsets. * @return Number of characters removed. */ size_type try_erase(difference_type signed_start_offset = 0, difference_type signed_end_offset = npos) noexcept { @@ -2833,7 +2821,7 @@ class basic_string { } /** - * @brief Inserts @b (in-place) a range of characters at a given signed offset. + * @brief Inserts @b (in-place) a range of characters at a given signed offset. * @return `true` if the insertion was successful, `false` otherwise. */ bool try_insert(difference_type signed_offset, string_view string) noexcept { @@ -2866,16 +2854,13 @@ class basic_string { #pragma region STL Interfaces - /** - * @brief Clears the string contents, but @b no deallocations happen. - */ + /** @brief Clears the string contents, but @b no deallocations happen. */ void clear() noexcept { sz_string_erase(&string_, 0, SZ_SIZE_MAX); } /** - * @brief Resizes the string to the given size, filling the new space with the given character, - * or NULL-character if nothing is provided. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. + * @brief Resizes the string to match @p count, filling the new space with the given @p character. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. 
*/ void resize(size_type count, value_type character = '\0') noexcept(false) { if (count > max_size()) throw std::length_error("sz::basic_string::resize"); @@ -2883,16 +2868,16 @@ class basic_string { } /** - * @brief Reclaims the unused memory, if any. - * @throw `std::bad_alloc` if the allocation fails. + * @brief Reclaims the unused memory, if any. + * @throw `std::bad_alloc` if the allocation fails. */ void shrink_to_fit() noexcept(false) { if (!try_shrink_to_fit()) throw std::bad_alloc(); } /** - * @brief Informs the string object of a planned change in size, so that it pre-allocate once. - * @throw `std::length_error` if the string is too long. + * @brief Informs the string object of a planned change in size, so that it can pre-allocate once. + * @throw `std::length_error` if the string is too long. */ void reserve(size_type capacity) noexcept(false) { if (capacity > max_size()) throw std::length_error("sz::basic_string::reserve"); @@ -2900,10 +2885,10 @@ class basic_string { } /** - * @brief Inserts @b (in-place) a ::character multiple times at the given offset. - * @throw `std::out_of_range` if `offset > size()`. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. + * @brief Inserts @b (in-place) a @p character multiple times at the given offset. + * @throw `std::out_of_range` if `offset > size()`. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. */ basic_string &insert(size_type offset, size_type repeats, char_type character) noexcept(false) { if (offset > size()) throw std::out_of_range("sz::basic_string::insert"); @@ -2916,10 +2901,10 @@ class basic_string { } /** - * @brief Inserts @b (in-place) a range of characters at the given offset. - * @throw `std::out_of_range` if `offset > size()`. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. + * @brief Inserts @b (in-place) a range of characters at the given offset. + * @throw `std::out_of_range` if `offset > size()`. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. */ basic_string &insert(size_type offset, string_view other) noexcept(false) { if (offset > size()) throw std::out_of_range("sz::basic_string::insert"); @@ -2933,20 +2918,20 @@ class basic_string { } /** - * @brief Inserts @b (in-place) a range of characters at the given offset. - * @throw `std::out_of_range` if `offset > size()`. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. + * @brief Inserts @b (in-place) a range of characters at the given offset. + * @throw `std::out_of_range` if `offset > size()`. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. */ basic_string &insert(size_type offset, const_pointer start, size_type length) noexcept(false) { return insert(offset, string_view(start, length)); } /** - * @brief Inserts @b (in-place) a slice of another string at the given offset. - * @throw `std::out_of_range` if `offset > size()` or `other_index > other.size()`. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. + * @brief Inserts @b (in-place) a slice of another string at the given offset. + * @throw `std::out_of_range` if `offset > size()` or `other_index > other.size()`. + * @throw `std::length_error` if the string is too long. 
+ * @throw `std::bad_alloc` if the allocation fails. */ basic_string &insert(size_type offset, string_view other, size_type other_index, size_type count = npos) noexcept(false) { @@ -2966,10 +2951,10 @@ class basic_string { } /** - * @brief Inserts @b (in-place) a ::character multiple times at the given iterator position. - * @throw `std::out_of_range` if `pos > size()` or `other_index > other.size()`. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. + * @brief Inserts @b (in-place) a @p character multiple times at the given iterator position. + * @throw `std::out_of_range` if `pos > size()` or `other_index > other.size()`. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. */ iterator insert(const_iterator it, size_type repeats, char_type character) noexcept(false) { auto pos = range_length(cbegin(), it); @@ -2978,10 +2963,10 @@ class basic_string { } /** - * @brief Inserts @b (in-place) a range at the given iterator position. - * @throw `std::out_of_range` if `pos > size()` or `other_index > other.size()`. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. + * @brief Inserts @b (in-place) a range at the given iterator position. + * @throw `std::out_of_range` if `pos > size()` or `other_index > other.size()`. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. */ template iterator insert(const_iterator it, input_iterator first, input_iterator last) noexcept(false) { @@ -3001,19 +2986,19 @@ class basic_string { } /** - * @brief Inserts @b (in-place) an initializer list of characters. - * @throw `std::out_of_range` if `pos > size()` or `other_index > other.size()`. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. + * @brief Inserts @b (in-place) an initializer list of characters. + * @throw `std::out_of_range` if `pos > size()` or `other_index > other.size()`. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. */ iterator insert(const_iterator it, std::initializer_list list) noexcept(false) { return insert(it, list.begin(), list.end()); } /** - * @brief Erases @b (in-place) the given range of characters. + * @brief Erases @b (in-place) the given range of characters. * @throws `std::out_of_range` if `pos > size()`. - * @see `try_erase_slice` for a cleaner exception-less alternative. + * @sa `try_erase_slice` for a cleaner exception-less alternative. */ basic_string &erase(size_type pos = 0, size_type count = npos) noexcept(false) { if (!count || empty()) return *this; @@ -3023,7 +3008,7 @@ class basic_string { } /** - * @brief Erases @b (in-place) the given range of characters. + * @brief Erases @b (in-place) the given range of characters. * @return Iterator pointing following the erased character, or end() if no such character exists. */ iterator erase(const_iterator first, const_iterator last) noexcept { @@ -3034,16 +3019,16 @@ class basic_string { } /** - * @brief Erases @b (in-place) the one character at a given postion. + * @brief Erases @b (in-place) the one character at a given position. * @return Iterator pointing following the erased character, or end() if no such character exists. 
*/ iterator erase(const_iterator pos) noexcept { return erase(pos, pos + 1); } /** - * @brief Replaces @b (in-place) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. - * @see `try_replace` for a cleaner exception-less alternative. + * @sa `try_replace` for a cleaner exception-less alternative. */ basic_string &replace(size_type pos, size_type count, string_view const &str) noexcept(false) { if (pos > size()) throw std::out_of_range("sz::basic_string::replace"); @@ -3054,20 +3039,20 @@ class basic_string { } /** - * @brief Replaces @b (in-place) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. - * @see `try_replace` for a cleaner exception-less alternative. + * @sa `try_replace` for a cleaner exception-less alternative. */ basic_string &replace(const_iterator first, const_iterator last, string_view const &str) noexcept(false) { return replace(range_length(cbegin(), first), last - first, str); } /** - * @brief Replaces @b (in-place) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @throws `std::out_of_range` if `pos > size()` or `pos2 > str.size()`. * @throws `std::length_error` if the string is too long. - * @see `try_replace` for a cleaner exception-less alternative. + * @sa `try_replace` for a cleaner exception-less alternative. */ basic_string &replace(size_type pos, size_type count, string_view const &str, size_type pos2, size_type count2 = npos) noexcept(false) { @@ -3075,20 +3060,20 @@ class basic_string { } /** - * @brief Replaces @b (in-place) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. - * @see `try_replace` for a cleaner exception-less alternative. + * @sa `try_replace` for a cleaner exception-less alternative. */ basic_string &replace(size_type pos, size_type count, const_pointer cstr, size_type count2) noexcept(false) { return replace(pos, count, string_view(cstr, count2)); } /** - * @brief Replaces @b (in-place) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. - * @see `try_replace` for a cleaner exception-less alternative. + * @sa `try_replace` for a cleaner exception-less alternative. */ basic_string &replace(const_iterator first, const_iterator last, const_pointer cstr, size_type count2) noexcept(false) { @@ -3096,30 +3081,30 @@ class basic_string { } /** - * @brief Replaces @b (in-place) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. - * @see `try_replace` for a cleaner exception-less alternative. + * @sa `try_replace` for a cleaner exception-less alternative. 
*/ basic_string &replace(size_type pos, size_type count, const_pointer cstr) noexcept(false) { return replace(pos, count, string_view(cstr)); } /** - * @brief Replaces @b (in-place) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. - * @see `try_replace` for a cleaner exception-less alternative. + * @sa `try_replace` for a cleaner exception-less alternative. */ basic_string &replace(const_iterator first, const_iterator last, const_pointer cstr) noexcept(false) { return replace(range_length(cbegin(), first), last - first, string_view(cstr)); } /** - * @brief Replaces @b (in-place) a range of characters with a repetition of given characters. + * @brief Replaces @b (in-place) a range of characters with a repetition of given characters. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. - * @see `try_replace` for a cleaner exception-less alternative. + * @sa `try_replace` for a cleaner exception-less alternative. */ basic_string &replace(size_type pos, size_type count, size_type count2, char_type character) noexcept(false) { if (pos > size()) throw std::out_of_range("sz::basic_string::replace"); @@ -3130,10 +3115,10 @@ class basic_string { } /** - * @brief Replaces @b (in-place) a range of characters with a repetition of given characters. + * @brief Replaces @b (in-place) a range of characters with a repetition of given characters. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. - * @see `try_replace` for a cleaner exception-less alternative. + * @sa `try_replace` for a cleaner exception-less alternative. */ basic_string &replace(const_iterator first, const_iterator last, size_type count2, char_type character) noexcept(false) { @@ -3141,10 +3126,10 @@ class basic_string { } /** - * @brief Replaces @b (in-place) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. - * @see `try_replace` for a cleaner exception-less alternative. + * @sa `try_replace` for a cleaner exception-less alternative. */ template basic_string &replace(const_iterator first, const_iterator last, input_iterator first2, @@ -3160,10 +3145,10 @@ class basic_string { } /** - * @brief Replaces @b (in-place) a range of characters with a given initializer list. + * @brief Replaces @b (in-place) a range of characters with a given initializer list. * @throws `std::out_of_range` if `pos > size()`. * @throws `std::length_error` if the string is too long. - * @see `try_replace` for a cleaner exception-less alternative. + * @sa `try_replace` for a cleaner exception-less alternative. */ basic_string &replace(const_iterator first, const_iterator last, std::initializer_list list) noexcept(false) { @@ -3171,9 +3156,9 @@ class basic_string { } /** - * @brief Appends the given character at the end. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. + * @brief Appends the given character at the end. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. 
*/ void push_back(char_type ch) noexcept(false) { if (size() == max_size()) throw std::length_error("string::push_back"); @@ -3181,16 +3166,16 @@ class basic_string { } /** - * @brief Removes the last character from the string. + * @brief Removes the last character from the string. * @warning The behavior is @b undefined if the string is empty. */ void pop_back() noexcept { sz_string_erase(&string_, size() - 1, 1); } /** - * @brief Overwrites the string with the given string. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. - * @see `try_assign` for a cleaner exception-less alternative. + * @brief Overwrites the string with the given string. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. + * @sa `try_assign` for a cleaner exception-less alternative. */ basic_string &assign(string_view other) noexcept(false) { if (!try_assign(other)) throw std::bad_alloc(); @@ -3198,10 +3183,10 @@ class basic_string { } /** - * @brief Overwrites the string with the given repeated character. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. - * @see `try_assign` for a cleaner exception-less alternative. + * @brief Overwrites the string with the given repeated character. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. + * @sa `try_assign` for a cleaner exception-less alternative. */ basic_string &assign(size_type repeats, char_type character) noexcept(false) { resize(repeats, character); @@ -3210,28 +3195,28 @@ class basic_string { } /** - * @brief Overwrites the string with the given string. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. - * @see `try_assign` for a cleaner exception-less alternative. + * @brief Overwrites the string with the given string. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. + * @sa `try_assign` for a cleaner exception-less alternative. */ basic_string &assign(const_pointer other, size_type length) noexcept(false) { return assign({other, length}); } /** - * @brief Overwrites the string with the given string. - * @throw `std::length_error` if the string is too long or `pos > str.size()`. - * @throw `std::bad_alloc` if the allocation fails. - * @see `try_assign` for a cleaner exception-less alternative. + * @brief Overwrites the string with the given string. + * @throw `std::length_error` if the string is too long or `pos > str.size()`. + * @throw `std::bad_alloc` if the allocation fails. + * @sa `try_assign` for a cleaner exception-less alternative. */ basic_string &assign(string_view str, size_type pos, size_type count = npos) noexcept(false) { return assign(str.substr(pos, count)); } /** - * @brief Overwrites the string with the given iterator range. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. - * @see `try_assign` for a cleaner exception-less alternative. + * @brief Overwrites the string with the given iterator range. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. + * @sa `try_assign` for a cleaner exception-less alternative. 
*/ template basic_string &assign(input_iterator first, input_iterator last) noexcept(false) { @@ -3241,20 +3226,20 @@ class basic_string { } /** - * @brief Overwrites the string with the given initializer list. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. - * @see `try_assign` for a cleaner exception-less alternative. + * @brief Overwrites the string with the given initializer list. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. + * @sa `try_assign` for a cleaner exception-less alternative. */ basic_string &assign(std::initializer_list list) noexcept(false) { return assign(list.begin(), list.end()); } /** - * @brief Appends to the end of the current string. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. - * @see `try_append` for a cleaner exception-less alternative. + * @brief Appends to the end of the current string. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. + * @sa `try_append` for a cleaner exception-less alternative. */ basic_string &append(string_view str) noexcept(false) { if (!try_append(str)) throw std::bad_alloc(); @@ -3262,36 +3247,36 @@ class basic_string { } /** - * @brief Appends to the end of the current string. - * @throw `std::length_error` if the string is too long or `pos > str.size()`. - * @throw `std::bad_alloc` if the allocation fails. - * @see `try_append` for a cleaner exception-less alternative. + * @brief Appends to the end of the current string. + * @throw `std::length_error` if the string is too long or `pos > str.size()`. + * @throw `std::bad_alloc` if the allocation fails. + * @sa `try_append` for a cleaner exception-less alternative. */ basic_string &append(string_view str, size_type pos, size_type length = npos) noexcept(false) { return append(str.substr(pos, length)); } /** - * @brief Appends to the end of the current string. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. - * @see `try_append` for a cleaner exception-less alternative. + * @brief Appends to the end of the current string. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. + * @sa `try_append` for a cleaner exception-less alternative. */ basic_string &append(const_pointer str, size_type length) noexcept(false) { return append({str, length}); } /** - * @brief Appends to the end of the current string. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. - * @see `try_append` for a cleaner exception-less alternative. + * @brief Appends to the end of the current string. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. + * @sa `try_append` for a cleaner exception-less alternative. */ basic_string &append(const_pointer str) noexcept(false) { return append(string_view(str)); } /** - * @brief Appends a repeated character to the end of the current string. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. - * @see `try_append` for a cleaner exception-less alternative. + * @brief Appends a repeated character to the end of the current string. + * @throw `std::length_error` if the string is too long. 
+ * @throw `std::bad_alloc` if the allocation fails. + * @sa `try_append` for a cleaner exception-less alternative. */ basic_string &append(size_type repeats, char_type ch) noexcept(false) { resize(size() + repeats, ch); @@ -3299,20 +3284,20 @@ class basic_string { } /** - * @brief Appends to the end of the current string. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. - * @see `try_append` for a cleaner exception-less alternative. + * @brief Appends to the end of the current string. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. + * @sa `try_append` for a cleaner exception-less alternative. */ basic_string &append(std::initializer_list other) noexcept(false) { return append(other.begin(), other.end()); } /** - * @brief Appends to the end of the current string. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. - * @see `try_append` for a cleaner exception-less alternative. + * @brief Appends to the end of the current string. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. + * @sa `try_append` for a cleaner exception-less alternative. */ template basic_string &append(input_iterator first, input_iterator last) noexcept(false) { @@ -3348,16 +3333,16 @@ class basic_string { return result; } - /** @brief Hashes the string, equivalent to `std::hash{}(str)`. */ + /** @brief Hashes the string, equivalent to `std::hash{}(str)`. */ size_type hash() const noexcept { return view().hash(); } - /** @brief Aggregates the values of individual bytes of a string. */ + /** @brief Aggregates the values of individual bytes of a string. */ size_type bytesum() const noexcept { return view().bytesum(); } /** * @brief Overwrites the string with random binary data. * - * @param nonce "Number used ONCE" to initialize the random number generator, @b don't repeat it! + * @param[in] nonce "Number used ONCE" to initialize the random number generator, @b don't repeat it! */ basic_string &randomize(sz_u64_t nonce) noexcept { sz_ptr_t start; @@ -3369,8 +3354,11 @@ class basic_string { /** * @brief Overwrites the string with random binary data. - * Produces the nonce from a static variable, incrementing it each time. - * In this case the undefined behaviour in concurrent environments plays in our favor. + * @sa sz_fill_random + * + * This overload produces the nonce from a static variable, incrementing it each time. + * In this case the undefined behaviour in concurrent environments may play in our favor, + * but it's recommended to use the other overload in such cases. */ basic_string &randomize() noexcept { static sz_u64_t nonce = 42; @@ -3378,27 +3366,25 @@ class basic_string { } /** - * @brief Generate a new random string of given length using `std::rand` as the random generator. - * May throw exceptions if the memory allocation fails. - * - * @param length The length of the generated string. - * @param nonce "Number used ONCE" to initialize the random number generator, @b don't repeat it! + * @brief Generate a new random binary string of given @p length. + * @param[in] length The length of the generated string. + * @param[in] nonce "Number used ONCE" to initialize the random number generator, @b don't repeat it! + * @throw `std::bad_alloc` if the allocation fails. 
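+ *
+ * A minimal illustrative sketch (assuming the `sz::string` alias of this template):
+ * @code{.cpp}
+ * sz::string salt = sz::string::random(16, 42u); // explicit nonce, reproducible output
+ * sz::string token = sz::string::random(32);     // nonce taken from an internal counter
+ * @endcode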
*/ static basic_string random(size_type length, sz_u64_t nonce) noexcept(false) { return basic_string(length, '\0').randomize(nonce); } /** - * @brief Generate a new random string of given length using the provided random number generator. - * May throw exceptions if the memory allocation fails. - * - * @param length The length of the generated string. + * @brief Generate a new random binary string of given @p length. + * @param[in] length The length of the generated string. + * @throw `std::bad_alloc` if the allocation fails. */ static basic_string random(size_type length) noexcept(false) { return basic_string(length, '\0').randomize(); } /** - * @brief Replaces @b (in-place) all occurrences of a given string with the ::replacement string. - * Similar to `boost::algorithm::replace_all` and Python's `str.replace`. + * @brief Replaces @b (in-place) all occurrences of a given string with the ::replacement string. + * @see Similar to `boost::algorithm::replace_all` and Python's `str.replace`. * * The implementation is not as composable, as using search ranges combined with a replacing mapping for matches, * and might be suboptimal, if you are exporting the cleaned-up string to another buffer. @@ -3410,8 +3396,8 @@ class basic_string { } /** - * @brief Replaces @b (in-place) all occurrences of a given character set with the ::replacement string. - * Similar to `boost::algorithm::replace_all` and Python's `str.replace`. + * @brief Replaces @b (in-place) all occurrences of a given character set with the ::replacement string. + * @see Similar to `boost::algorithm::replace_all` and Python's `str.replace`. * * The implementation is not as composable, as using search ranges combined with a replacing mapping for matches, * and might be suboptimal, if you are exporting the cleaned-up string to another buffer. @@ -3423,8 +3409,8 @@ class basic_string { } /** - * @brief Replaces @b (in-place) all occurrences of a given string with the ::replacement string. - * Similar to `boost::algorithm::replace_all` and Python's `str.replace`. + * @brief Replaces @b (in-place) all occurrences of a given string with the ::replacement string. + * @see Similar to `boost::algorithm::replace_all` and Python's `str.replace`. * * The implementation is not as composable, as using search ranges combined with a replacing mapping for matches, * and might be suboptimal, if you are exporting the cleaned-up string to another buffer. @@ -3435,8 +3421,8 @@ class basic_string { } /** - * @brief Replaces @b (in-place) all occurrences of a given character set with the ::replacement string. - * Similar to `boost::algorithm::replace_all` and Python's `str.replace`. + * @brief Replaces @b (in-place) all occurrences of a given character set with the ::replacement string. + * @see Similar to `boost::algorithm::replace_all` and Python's `str.replace`. * * The implementation is not as composable, as using search ranges combined with a replacing mapping for matches, * and might be suboptimal, if you are exporting the cleaned-up string to another buffer. @@ -3447,7 +3433,8 @@ class basic_string { } /** - * @brief Replaces @b (in-place) all characters in the string using the provided lookup table. + * @brief Replaces @b (in-place) all characters in the string using the provided lookup @p table. 
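+ *
+ * An illustrative sketch; the `identity()` factory and the `operator[]` access are assumptions
+ * about how the table is constructed, adjust to the actual `look_up_table` interface:
+ * @code{.cpp}
+ * sz::string text = "Hello, World!";
+ * auto table = sz::look_up_table::identity(); // assumed: start from a byte-identity mapping
+ * table['o'] = '0';
+ * text.transform(table); // "Hell0, W0rld!"
+ * @endcode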
+ * @sa sz_lookup */ basic_string &transform(look_up_table const &table) noexcept { transform(table, data()); @@ -3455,8 +3442,9 @@ class basic_string { } /** - * @brief Maps all characters in the current string into another buffer using the provided lookup table. - * @param output The buffer to write the transformed string into. + * @brief Maps all characters in the current string into the @p output buffer using the provided lookup @p table. + * @param[in] output The buffer to write the transformed string into. + * @sa sz_lookup */ void transform(look_up_table const &table, pointer output) const noexcept { sz_ptr_t start; @@ -3470,8 +3458,8 @@ class basic_string { bool try_replace_all_(pattern_type pattern, string_view replacement) noexcept; /** - * @brief Tries to prepare the string for a replacement of a given range with a new string. - * The allocation may occur, if the replacement is longer than the replaced range. + * @brief Tries to prepare the string for a replacement of a given range with a new string. + * @warning A memory allocation may occur, if the replacement is longer than the replaced range. */ bool try_preparing_replacement(size_type offset, size_type length, size_type new_length) noexcept; }; @@ -3738,8 +3726,8 @@ struct string_view_less { }; /** - * @brief Helper function-like object to check equality between string-view convertible objects with StringZilla. - * @see Similar to `std::equal_to`: https://en.cppreference.com/w/cpp/utility/functional/equal_to + * @brief Helper function-like object to check equality between string-view convertible objects with StringZilla. + * @see Similar to `std::equal_to`: https://en.cppreference.com/w/cpp/utility/functional/equal_to * * Unlike the STL analog, doesn't require C++14 or including the heavy `` header. * Can be used to combine STL classes with StringZilla logic, like: @@ -3750,8 +3738,8 @@ struct string_view_equal_to { }; /** - * @brief Helper function-like object to hash string-view convertible objects with StringZilla. - * @see Similar to `std::hash`: https://en.cppreference.com/w/cpp/utility/functional/hash + * @brief Helper function-like object to hash string-view convertible objects with StringZilla. + * @see Similar to `std::hash`: https://en.cppreference.com/w/cpp/utility/functional/hash * * Unlike the STL analog, doesn't require C++14 or including the heavy `` header. * Can be used to combine STL classes with StringZilla logic, like: @@ -3761,7 +3749,7 @@ struct string_view_hash { std::size_t operator()(string_view str) const noexcept { return str.hash(); } }; -/** @brief SFINAE-type used to infer the resulting type of concatenating multiple string together. */ +/** @brief SFINAE-type used to infer the resulting type of concatenating multiple string together. */ template struct concatenation_result {}; @@ -3776,8 +3764,8 @@ struct concatenation_result { }; /** - * @brief Concatenates two strings into a template expression. - * @see `concatenation` class for more details. + * @brief Concatenates two strings into a template expression. + * @sa `concatenation` class for more details. */ template concatenation concatenate(first_type &&first, second_type &&second) noexcept(false) { @@ -3785,8 +3773,8 @@ concatenation concatenate(first_type &&first, second_ty } /** - * @brief Concatenates two or more strings into a template expression. - * @see `concatenation` class for more details. + * @brief Concatenates two or more strings into a template expression. + * @sa `concatenation` class for more details. 
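+ *
+ * An illustrative sketch; assigning into an `sz::string` is assumed to materialize the lazy expression:
+ * @code{.cpp}
+ * sz::string url = sz::concatenate(sz::string_view("https://"), sz::string_view("example"), sz::string_view(".com"));
+ * @endcode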
*/ template typename concatenation_result::type concatenate( @@ -3806,8 +3794,8 @@ typename concatenation_result::type } /** - * @brief Calculates the Hamming edit distance in @b bytes between two strings. - * @see sz_levenshtein_distance + * @brief Calculates the Hamming edit distance in @b bytes between two strings. + * @sa sz_levenshtein_distance */ template std::size_t hamming_distance( // @@ -3819,8 +3807,8 @@ std::size_t hamming_distance( } /** - * @brief Calculates the Hamming edit distance in @b bytes between two strings. - * @see sz_levenshtein_distance + * @brief Calculates the Hamming edit distance in @b bytes between two strings. + * @sa sz_levenshtein_distance */ template ::type>> std::size_t hamming_distance( // @@ -3830,8 +3818,8 @@ std::size_t hamming_distance( } /** - * @brief Calculates the Hamming edit distance in @b unicode codepoints between two strings. - * @see sz_hamming_distance_utf8 + * @brief Calculates the Hamming edit distance in @b unicode codepoints between two strings. + * @sa sz_hamming_distance_utf8 */ template std::size_t hamming_distance_utf8( // @@ -3842,8 +3830,8 @@ std::size_t hamming_distance_utf8( // } /** - * @brief Calculates the Hamming edit distance in @b unicode codepoints between two strings. - * @see sz_levenshtein_distance + * @brief Calculates the Hamming edit distance in @b unicode codepoints between two strings. + * @sa sz_levenshtein_distance */ template ::type>> std::size_t hamming_distance_utf8( // @@ -3853,8 +3841,8 @@ std::size_t hamming_distance_utf8( // } /** - * @brief Calculates the Levenshtein edit distance in @b bytes between two strings. - * @see sz_levenshtein_distance + * @brief Calculates the Levenshtein edit distance in @b bytes between two strings. + * @sa sz_levenshtein_distance */ template ::type>> std::size_t edit_distance( // @@ -3870,8 +3858,8 @@ std::size_t edit_distance( // } /** - * @brief Calculates the Levenshtein edit distance in @b bytes between two strings. - * @see sz_levenshtein_distance + * @brief Calculates the Levenshtein edit distance in @b bytes between two strings. + * @sa sz_levenshtein_distance */ template > std::size_t edit_distance( // @@ -3881,8 +3869,8 @@ std::size_t edit_distance( } /** - * @brief Calculates the Levenshtein edit distance in @b unicode codepoints between two strings. - * @see sz_levenshtein_distance_utf8 + * @brief Calculates the Levenshtein edit distance in @b unicode codepoints between two strings. + * @sa sz_levenshtein_distance_utf8 */ template ::type>> std::size_t edit_distance_utf8( // @@ -3898,8 +3886,8 @@ std::size_t edit_distance_utf8( } /** - * @brief Calculates the Levenshtein edit distance in @b unicode codepoints between two strings. - * @see sz_levenshtein_distance_utf8 + * @brief Calculates the Levenshtein edit distance in @b unicode codepoints between two strings. + * @sa sz_levenshtein_distance_utf8 */ template > std::size_t edit_distance_utf8( // @@ -3909,8 +3897,8 @@ std::size_t edit_distance_utf8( } /** - * @brief Calculates the Needleman-Wunsch alignment score between two strings. - * @see sz_needleman_wunsch_score + * @brief Calculates the Needleman-Wunsch alignment score between two strings. + * @sa sz_needleman_wunsch_score */ template ::type>> std::ptrdiff_t alignment_score( // @@ -3932,8 +3920,8 @@ std::ptrdiff_t alignment_score( } /** - * @brief Calculates the Needleman-Wunsch alignment score between two strings. - * @see sz_needleman_wunsch_score + * @brief Calculates the Needleman-Wunsch alignment score between two strings. 
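+ *
+ * With unit costs the score is simply the negated Levenshtein distance. A hedged sketch, with the
+ * argument order assumed to be (first, second, substitution costs, gap cost):
+ * @code{.cpp}
+ * signed char costs[256 * 256];
+ * for (int i = 0; i != 256; ++i)
+ *     for (int j = 0; j != 256; ++j) costs[i * 256 + j] = (i == j) ? 0 : -1;
+ * std::ptrdiff_t score = sz::alignment_score("kitten", "sitting", costs, -1); // == -3
+ * @endcode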
+ * @sa sz_needleman_wunsch_score */ template > std::ptrdiff_t alignment_score( // @@ -3943,10 +3931,10 @@ std::ptrdiff_t alignment_score( } /** - * @brief Overwrites the string slice with random characters from the given alphabet using the random generator. - * - * @param string The string to overwrite. - * @param nonce "Number used ONCE" to initialize the random number generator, @b don't repeat it! + * @brief Overwrites the @p string slice with random bytes. + * @param[in] string The string to overwrite. + * @param[in] nonce "Number used ONCE" to initialize the random number generator, @b don't repeat it! + * @sa sz_fill_random */ template void randomize(basic_string_slice string, sz_u64_t nonce) noexcept { @@ -3955,7 +3943,9 @@ void randomize(basic_string_slice string, sz_u64_t nonce) noexcept { } /** - * @brief Replaces @b (in-place) all characters in the string using the provided lookup table. + * @brief Overwrites the @p string slice with random bytes using `std::rand` for the nonce. + * @param[in] string The string to overwrite. + * @sa sz_fill_random */ template void lookup(basic_string_slice string, basic_look_up_table const &table) noexcept { @@ -3964,7 +3954,8 @@ void lookup(basic_string_slice string, basic_look_up_table void lookup( // @@ -3975,11 +3966,8 @@ void lookup( // } /** - * @brief Overwrites the string slice with random characters from the given alphabet - * using `std::rand` as the random generator. - * - * @param string The string to overwrite. - * @param alphabet A string of characters to choose from. + * @brief Replaces @b (in-place) all characters in the string using the provided lookup table. + * @sa sz_lookup */ template void randomize(basic_string_slice string, string_view alphabet = "abcdefghijklmnopqrstuvwxyz") noexcept { @@ -3989,8 +3977,8 @@ void randomize(basic_string_slice string, string_view alphabet = "ab using sorted_idx_t = sz_sorted_idx_t; /** - * @brief Internal data-structure used to forward the arguments to the `sz_sequence_argsort` function. - * @see argsort + * @brief Internal data-structure used to wrap arbitrary sequential containers with a random-order lookup. + * @sa try_argsort, argsort, try_join, join */ template struct _sequence_args { @@ -4064,8 +4052,8 @@ void hashes_fingerprint( // } /** - * @brief Computes the Rabin-Karp-like rolling binary fingerprint of a string. - * @see sz_hashes + * @brief Computes the Rabin-Karp-like rolling binary fingerprint of a string. + * @sa sz_hashes */ template std::bitset hashes_fingerprint( // @@ -4076,8 +4064,8 @@ std::bitset hashes_fingerprint( // } /** - * @brief Computes the Rabin-Karp-like rolling binary fingerprint of a string. - * @see sz_hashes + * @brief Computes the Rabin-Karp-like rolling binary fingerprint of a string. 
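+ *
+ * Such fingerprints act as cheap similarity sketches: two bitsets can be intersected with bitwise
+ * AND and `count()` without re-reading the strings. A hedged sketch, assuming the bit-width is
+ * passed as the leading template argument, as in the overloads below:
+ * @code{.cpp}
+ * std::bitset<512> a = sz::hashes_fingerprint<512>(sz::string_view("the quick brown fox"), 4);
+ * std::bitset<512> b = sz::hashes_fingerprint<512>(sz::string_view("the quick brown dog"), 4);
+ * std::size_t shared = (a & b).count();
+ * @endcode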
+ * @sa sz_hashes */ template std::bitset hashes_fingerprint(basic_string const &str, std::size_t window_length) noexcept { From 407dd2de067bea93c1144e35bc8e3a8ab670ef64 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 7 Mar 2025 12:54:55 +0000 Subject: [PATCH 151/751] Docs: Ignore C++ docstring updates blame --- .git-blame-ignore-revs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 3d26edb4..c583f5fb 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -38,3 +38,5 @@ b007ba571860e1d3737d1478c7f8d66ae1839e36 bd547453122e9f8565e5be15f137e7b0de37caca 22e3d1e34d62d68c1e89df7c8bdc201faa18a9de ecb377541d0c706cf8997faff4f026b07e3f76f3 +0d982a45f842287d7e344f0d8b360f52482017f5 + From b6e4406101cda970659c64a9215b7e81072b2168 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 7 Mar 2025 13:02:30 +0000 Subject: [PATCH 152/751] Docs: Details on the Unicode range --- include/stringzilla/types.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index a15cf116..f5658035 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -368,7 +368,8 @@ typedef enum { } sz_status_t; /** - * @brief Describes the length of a UTF8 @b rune / character / codepoint in bytes. + * @brief Describes the length of a UTF-8 @b rune / character / codepoint in bytes, which can be 1 to 4. + * @see https://en.wikipedia.org/wiki/UTF-8 */ typedef enum { sz_utf8_invalid_k = 0, //!< Invalid UTF8 character. @@ -378,6 +379,16 @@ typedef enum { sz_utf8_rune_4bytes_k = 4, //!< 4-byte UTF8 character. } sz_rune_length_t; +/** + * @brief Stores a single UTF-8 @b rune / character / codepoint unpacked into @b UTF-32. + * @see https://en.wikipedia.org/wiki/UTF-32 + * + * The theoretical capacity of the underlying numeric type is 4 bytes, with over 4 billion possible states, but: + * - UTF-8, however, in its' largest 4-byte form has only 3+6+6+6 = 21 bits of usable space for 2 million states. + * - Unicode, in turn, has only @b 1'114'112 possible code points from U+0000 to U+10FFFF. + * - Of those, in Unicode 16, only @b 155'063 are assigned characters ~ a little over 17 bits of content. + * That's @b 0.004% of the 32-bit space, so sparse data-structures are encouraged for UTF-8 oriented algorithms. + */ typedef sz_u32_t sz_rune_t; /** From 5c02c4edb7b559ed129047fdb6739f2721fc481b Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 05:07:11 +0000 Subject: [PATCH 153/751] Docs: Formatting --- include/stringzilla/types.h | 148 +++++++++++++++++------------------- 1 file changed, 70 insertions(+), 78 deletions(-) diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index f5658035..6d693347 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -391,8 +391,15 @@ typedef enum { */ typedef sz_u32_t sz_rune_t; +SZ_PUBLIC sz_rune_t sz_rune_perfect_hash(sz_rune_t rune) { + // TODO: A perfect hashing scheme can be constructed to map a 32-bit rune into an 18-bit representation, + // TODO: that can fit all of the unique values in the Unicode 16 standard. + return rune; +} + /** - * @brief Tiny string-view structure. It's POD type, unlike the `std::string_view`. + * @brief Tiny string-view structure. It's Plain-Old Datatype @b (POD) type, unlike the `std::string_view`. 
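+ *
+ * Being a POD, it can be aggregate-initialized and copied with `memcpy` (illustrative sketch):
+ * @code{.c}
+ * sz_string_view_t hello = {"hello", 5};
+ * @endcode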
+ * @see https://en.cppreference.com/w/cpp/named_req/PODType */ typedef struct sz_string_view_t { sz_cptr_t start; @@ -402,8 +409,8 @@ typedef struct sz_string_view_t { #pragma region Character Sets /** - * @brief Bit-set semi-opaque structure for 256 possible byte values. Useful for filtering and search. - * @sa sz_byteset_init, sz_byteset_add, sz_byteset_contains, sz_byteset_invert + * @brief Bit-set semi-opaque structure for 256 possible byte values. Useful for filtering and search. + * @sa sz_byteset_init, sz_byteset_add, sz_byteset_contains, sz_byteset_invert * * Example usage: * @@ -426,22 +433,22 @@ typedef union sz_byteset_t { sz_u8_t _u8s[32]; } sz_byteset_t; -/** @brief Initializes a bit-set to an empty collection, meaning - all characters are banned. */ +/** @brief Initializes a bit-set to an empty collection, meaning - all characters are banned. */ SZ_PUBLIC void sz_byteset_init(sz_byteset_t *s) { s->_u64s[0] = s->_u64s[1] = s->_u64s[2] = s->_u64s[3] = 0; } -/** @brief Initializes a bit-set to all ASCII character. */ +/** @brief Initializes a bit-set to all ASCII character. */ SZ_PUBLIC void sz_byteset_init_ascii(sz_byteset_t *s) { s->_u64s[0] = s->_u64s[1] = 0xFFFFFFFFFFFFFFFFull; s->_u64s[2] = s->_u64s[3] = 0; } -/** @brief Adds a character to the set and accepts @b unsigned integers. */ +/** @brief Adds a character to the set and accepts @b unsigned integers. */ SZ_PUBLIC void sz_byteset_add_u8(sz_byteset_t *s, sz_u8_t c) { s->_u64s[c >> 6] |= (1ull << (c & 63u)); } -/** @brief Adds a character to the set. Consider @b sz_byteset_add_u8. */ +/** @brief Adds a character to the set. Consider @b sz_byteset_add_u8. */ SZ_PUBLIC void sz_byteset_add(sz_byteset_t *s, char c) { sz_byteset_add_u8(s, *(sz_u8_t *)(&c)); } // bitcast -/** @brief Checks if the set contains a given character and accepts @b unsigned integers. */ +/** @brief Checks if the set contains a given character and accepts @b unsigned integers. */ SZ_PUBLIC sz_bool_t sz_byteset_contains_u8(sz_byteset_t const *s, sz_u8_t c) { // Checking the bit can be done in different ways: // - (s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0 @@ -451,12 +458,12 @@ SZ_PUBLIC sz_bool_t sz_byteset_contains_u8(sz_byteset_t const *s, sz_u8_t c) { return (sz_bool_t)((s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0); } -/** @brief Checks if the set contains a given character. Consider @b sz_byteset_contains_u8. */ +/** @brief Checks if the set contains a given character. Consider @b sz_byteset_contains_u8. */ SZ_PUBLIC sz_bool_t sz_byteset_contains(sz_byteset_t const *s, char c) { return sz_byteset_contains_u8(s, *(sz_u8_t *)(&c)); // bitcast } -/** @brief Inverts the contents of the set, so allowed character get disallowed, and vice versa. */ +/** @brief Inverts the contents of the set, so allowed character get disallowed, and vice versa. */ SZ_PUBLIC void sz_byteset_invert(sz_byteset_t *s) { s->_u64s[0] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[1] ^= 0xFFFFFFFFFFFFFFFFull, // s->_u64s[2] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[3] ^= 0xFFFFFFFFFFFFFFFFull; @@ -472,7 +479,7 @@ typedef void (*sz_memory_free_t)(void *, sz_size_t, void *); /** * @brief Some complex pattern matching algorithms may require memory allocations. * This structure is used to pass the memory allocator to those functions. 
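+ *
+ * A minimal sketch of routing allocations into a caller-owned buffer (see `sz_memory_allocator_init_fixed` below):
+ * @code{.c}
+ * char scratch[4096];
+ * sz_memory_allocator_t alloc;
+ * sz_memory_allocator_init_fixed(&alloc, scratch, sizeof(scratch));
+ * // `&alloc` can now be passed wherever a `sz_memory_allocator_t *` is expected, instead of NULL.
+ * @endcode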
- * @see sz_memory_allocator_init_fixed + * @sa sz_memory_allocator_init_fixed */ typedef struct sz_memory_allocator_t { sz_memory_allocate_t allocate; @@ -481,21 +488,17 @@ typedef struct sz_memory_allocator_t { } sz_memory_allocator_t; /** - * @brief Initializes a memory allocator to use the system default `malloc` and `free`. - * ! The function is not available if the library was compiled with `SZ_AVOID_LIBC`. - * - * @param alloc Memory allocator to initialize. + * @brief Initializes a memory allocator to use the system default `malloc` and `free`. + * @warning The function is not available if the library was compiled with `SZ_AVOID_LIBC`. + * @param[in] alloc Memory allocator to initialize. */ SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc); /** - * @brief Initializes a memory allocator to use a static-capacity buffer. - * No dynamic allocations will be performed. - * - * @param alloc Memory allocator to initialize. - * @param buffer Buffer to use for allocations. - * @param length Length of the buffer. @b Must be greater than 8 bytes. Different values would be optimal for - * different algorithms and input lengths, but 4096 bytes (one RAM page) is a good default. + * @brief Initializes a memory allocator to use only a static-capacity buffer @b w/out any dynamic allocations. + * @param[in] alloc Memory allocator to initialize. + * @param[in] buffer Buffer to use for allocations. + * @param[in] length Length of the buffer. @b Must be greater than 8, at least 4KB (one RAM page) is recommended. */ SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length); @@ -503,66 +506,66 @@ SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void #pragma region API Signature Types -/** @brief Signature of `sz_hash`. */ +/** @brief Signature of `sz_hash`. */ typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t, sz_u64_t); -/** @brief Signature of `sz_hash_state_init`. */ +/** @brief Signature of `sz_hash_state_init`. */ typedef void (*sz_hash_state_init_t)(struct sz_hash_state_t *, sz_u64_t); -/** @brief Signature of `sz_hash_state_stream`. */ +/** @brief Signature of `sz_hash_state_stream`. */ typedef void (*sz_hash_state_stream_t)(struct sz_hash_state_t *, sz_cptr_t, sz_size_t); -/** @brief Signature of `sz_hash_state_fold`. */ +/** @brief Signature of `sz_hash_state_fold`. */ typedef sz_u64_t (*sz_hash_state_fold_t)(struct sz_hash_state_t const *); -/** @brief Signature of `sz_bytesum`. */ +/** @brief Signature of `sz_bytesum`. */ typedef sz_u64_t (*sz_bytesum_t)(sz_cptr_t, sz_size_t); -/** @brief Signature of `sz_fill_random`. */ +/** @brief Signature of `sz_fill_random`. */ typedef void (*sz_fill_random_t)(sz_ptr_t, sz_size_t, sz_u64_t); -/** @brief Signature of `sz_equal`. */ +/** @brief Signature of `sz_equal`. */ typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t); -/** @brief Signature of `sz_order`. */ +/** @brief Signature of `sz_order`. */ typedef sz_ordering_t (*sz_order_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -/** @brief Signature of `sz_lookup`. */ +/** @brief Signature of `sz_lookup`. */ typedef void (*sz_lookup_t)(sz_ptr_t, sz_size_t, sz_cptr_t, sz_cptr_t); -/** @brief Signature of `sz_move`. */ +/** @brief Signature of `sz_move`. */ typedef void (*sz_move_t)(sz_ptr_t, sz_cptr_t, sz_size_t); -/** @brief Signature of `sz_fill`. */ +/** @brief Signature of `sz_fill`. */ typedef void (*sz_fill_t)(sz_ptr_t, sz_size_t, sz_u8_t); -/** @brief Signature of `sz_find_byte`. 
*/ +/** @brief Signature of `sz_find_byte`. */ typedef sz_cptr_t (*sz_find_byte_t)(sz_cptr_t, sz_size_t, sz_cptr_t); -/** @brief Signature of `sz_find`. */ +/** @brief Signature of `sz_find`. */ typedef sz_cptr_t (*sz_find_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t); -/** @brief Signature of `sz_find_set`. */ +/** @brief Signature of `sz_find_set`. */ typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_byteset_t const *); -/** @brief Signature of `sz_hamming_distance`. */ +/** @brief Signature of `sz_hamming_distance`. */ typedef sz_status_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_size_t *); -/** @brief Signature of `sz_levenshtein_distance`. */ +/** @brief Signature of `sz_levenshtein_distance`. */ typedef sz_status_t (*sz_levenshtein_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_memory_allocator_t *, sz_size_t *); -/** @brief Signature of `sz_needleman_wunsch_score`. */ +/** @brief Signature of `sz_needleman_wunsch_score`. */ typedef sz_status_t (*sz_needleman_wunsch_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *, sz_error_cost_t, sz_memory_allocator_t *, sz_ssize_t *); -/** @brief Signature of `sz_sequence_argsort`. */ +/** @brief Signature of `sz_sequence_argsort`. */ typedef sz_status_t (*sz_sequence_argsort_t)(struct sz_sequence_t const *, sz_memory_allocator_t *, sz_sorted_idx_t *); -/** @brief Signature of `sz_pgrams_sort`. */ +/** @brief Signature of `sz_pgrams_sort`. */ typedef sz_status_t (*sz_pgrams_sort_t)(sz_pgram_t *, sz_size_t, sz_memory_allocator_t *, sz_sorted_idx_t *); -/** @brief Signature of `sz_sequence_join`. */ +/** @brief Signature of `sz_sequence_join`. */ typedef sz_status_t (*sz_sequence_join_t)(struct sz_sequence_t const *, struct sz_sequence_t const *, sz_memory_allocator_t *, sz_size_t *, sz_sorted_idx_t *, sz_sorted_idx_t *); @@ -571,8 +574,8 @@ typedef sz_status_t (*sz_sequence_join_t)(struct sz_sequence_t const *, struct s #pragma region Helper Structures /** - * @brief Helper structure to simplify work with 16-bit words. - * @see sz_u16_load + * @brief Helper structure to simplify work with 16-bit words. + * @sa sz_u16_load */ typedef union sz_u16_vec_t { sz_u16_t u16; @@ -580,8 +583,8 @@ typedef union sz_u16_vec_t { } sz_u16_vec_t; /** - * @brief Helper structure to simplify work with 32-bit words. - * @see sz_u32_load + * @brief Helper structure to simplify work with 32-bit words. + * @sa sz_u32_load */ typedef union sz_u32_vec_t { sz_u32_t u32; @@ -590,8 +593,8 @@ typedef union sz_u32_vec_t { } sz_u32_vec_t; /** - * @brief Helper structure to simplify work with 64-bit words. - * @see sz_u64_load + * @brief Helper structure to simplify work with 64-bit words. + * @sa sz_u64_load */ typedef union sz_u64_vec_t { sz_u64_t u64; @@ -662,9 +665,7 @@ typedef union sz_u512_vec_t { #pragma region UTF8 -/** - * @brief Extracts just one UTF8 codepoint from a UTF8 string into a 32-bit unsigned integer. - */ +/** @brief Extracts just one UTF8 codepoint from a UTF8 string into a 32-bit unsigned integer. */ SZ_INTERNAL void _sz_extract_utf8_rune(sz_cptr_t utf8, sz_rune_t *code, sz_rune_length_t *code_length) { sz_u8_t const *current = (sz_u8_t const *)utf8; sz_u8_t leading_byte = *current++; @@ -708,8 +709,8 @@ SZ_INTERNAL void _sz_extract_utf8_rune(sz_cptr_t utf8, sz_rune_t *code, sz_rune_ } /** - * @brief Exports a UTF8 string into a UTF32 buffer. - * ! The result is undefined id the UTF8 string is corrupted. 
+ * @brief Exports a UTF8 string into a UTF32 buffer. + * @warning The result is undefined id the UTF8 string is corrupted. * @return The length in the number of codepoints. */ SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_length, sz_rune_t *utf32) { @@ -771,14 +772,10 @@ SZ_PUBLIC void sz_sequence_from_null_terminated_strings(sz_cptr_t *start, sz_siz ********************************************************************************************************************** */ -/** - * @brief Helper-macro to mark potentially unused variables. - */ +/** @brief Helper-macro to mark potentially unused variables. */ #define sz_unused(x) ((void)(x)) -/** - * @brief Helper-macro casting a variable to another type of the same size. - */ +/** @brief Helper-macro casting a variable to another type of the same size. */ #define sz_bitcast(type, value) (*((type *)&(value))) /** @@ -1024,7 +1021,8 @@ SZ_INTERNAL void sz_ssize_clamp_interval( // } /** - * @brief Compute the logarithm base 2 of a positive integer, rounding down. + * @brief Compute the logarithm base 2 of a positive integer, rounding down. + * @pre Input must be a positive number, as the logarithm of zero is undefined. */ SZ_INTERNAL sz_size_t sz_size_log2i_nonzero(sz_size_t x) { _sz_assert(x > 0 && "Non-positive numbers have no defined logarithm"); @@ -1033,11 +1031,11 @@ SZ_INTERNAL sz_size_t sz_size_log2i_nonzero(sz_size_t x) { } /** - * @brief Compute the smallest power of two greater than or equal to @p x. + * @brief Compute the smallest power of two greater than or equal to @p x. + * @pre Unlike the commonly used trick with `clz` intrinsics, is valid across the whole range of `x`, @b including 0. + * @see https://stackoverflow.com/a/10143264 */ SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) { - // Unlike the commonly used trick with `clz` intrinsics, is valid across the whole range of `x`. - // https://stackoverflow.com/a/10143264 x--; x |= x >> 1; x |= x >> 2; @@ -1052,7 +1050,7 @@ SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) { } /** - * @brief Transposes an 8x8 bit matrix packed in a `sz_u64_t`. + * @brief Transposes an 8x8 bit matrix packed in a `sz_u64_t`. * * There is a well known SWAR sequence for that known to chess programmers, * willing to flip a bit-matrix of pieces along the main A1-H8 diagonal. @@ -1070,9 +1068,7 @@ SZ_INTERNAL sz_u64_t sz_u64_transpose(sz_u64_t x) { return x; } -/** - * @brief Load a 16-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ +/** @brief Load a 16-bit unsigned integer from a potentially unaligned pointer. Can be expensive on some platforms. */ SZ_INTERNAL sz_u16_vec_t sz_u16_load(sz_cptr_t ptr) { #if !SZ_USE_MISALIGNED_LOADS sz_u16_vec_t result; @@ -1080,7 +1076,7 @@ SZ_INTERNAL sz_u16_vec_t sz_u16_load(sz_cptr_t ptr) { result.u8s[1] = ptr[1]; return result; #elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. +#if defined(_M_IX86) //< The `__unaligned` modifier isn't valid for the x86 platform. return *((sz_u16_vec_t *)ptr); #else return *((__unaligned sz_u16_vec_t *)ptr); @@ -1091,9 +1087,7 @@ SZ_INTERNAL sz_u16_vec_t sz_u16_load(sz_cptr_t ptr) { #endif } -/** - * @brief Load a 32-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ +/** @brief Load a 32-bit unsigned integer from a potentially unaligned pointer. Can be expensive on some platforms. 
*/ SZ_INTERNAL sz_u32_vec_t sz_u32_load(sz_cptr_t ptr) { #if !SZ_USE_MISALIGNED_LOADS sz_u32_vec_t result; @@ -1103,7 +1097,7 @@ SZ_INTERNAL sz_u32_vec_t sz_u32_load(sz_cptr_t ptr) { result.u8s[3] = ptr[3]; return result; #elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. +#if defined(_M_IX86) //< The `__unaligned` modifier isn't valid for the x86 platform. return *((sz_u32_vec_t *)ptr); #else return *((__unaligned sz_u32_vec_t *)ptr); @@ -1114,9 +1108,7 @@ SZ_INTERNAL sz_u32_vec_t sz_u32_load(sz_cptr_t ptr) { #endif } -/** - * @brief Load a 64-bit unsigned integer from a potentially unaligned pointer, can be expensive on some platforms. - */ +/** @brief Load a 64-bit unsigned integer from a potentially unaligned pointer. Can be expensive on some platforms. */ SZ_INTERNAL sz_u64_vec_t sz_u64_load(sz_cptr_t ptr) { #if !SZ_USE_MISALIGNED_LOADS sz_u64_vec_t result; @@ -1130,7 +1122,7 @@ SZ_INTERNAL sz_u64_vec_t sz_u64_load(sz_cptr_t ptr) { result.u8s[7] = ptr[7]; return result; #elif defined(_MSC_VER) && !defined(__clang__) -#if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform. +#if defined(_M_IX86) //< The `__unaligned` modifier isn't valid for the x86 platform. return *((sz_u64_vec_t *)ptr); #else return *((__unaligned sz_u64_vec_t *)ptr); @@ -1141,7 +1133,7 @@ SZ_INTERNAL sz_u64_vec_t sz_u64_load(sz_cptr_t ptr) { #endif } -/** @brief Helper function, using the supplied fixed-capacity buffer to allocate memory. */ +/** @brief Helper function, using the supplied fixed-capacity buffer to allocate memory. */ SZ_INTERNAL sz_ptr_t _sz_memory_allocate_fixed(sz_size_t length, void *handle) { sz_size_t capacity; *(sz_ptr_t)&capacity = *(sz_cptr_t)handle; @@ -1150,7 +1142,7 @@ SZ_INTERNAL sz_ptr_t _sz_memory_allocate_fixed(sz_size_t length, void *handle) { return (sz_ptr_t)handle + consumed_capacity; } -/** @brief Helper "no-op" function, simulating memory deallocation when we use a "static" memory buffer. */ +/** @brief Helper "no-op" function, simulating memory deallocation when we use a "static" memory buffer. 
*/ SZ_INTERNAL void _sz_memory_free_fixed(sz_ptr_t start, sz_size_t length, void *handle) { sz_unused(start && length && handle); } From 8dc4a2c70fb84f87c9e573579dc496a91b5f22e4 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 05:07:51 +0000 Subject: [PATCH 154/751] Fix: Randomization benchmarks --- scripts/bench_token.cpp | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/scripts/bench_token.cpp b/scripts/bench_token.cpp index 0d83604b..112fbc98 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -95,27 +95,25 @@ tracked_unary_functions_t hash_stream_functions() { return result; } -tracked_unary_functions_t random_generation_functions(std::size_t token_length) { +tracked_unary_functions_t random_generation_functions() { static std::vector buffer; - if (buffer.size() < token_length) buffer.resize(token_length); - - auto suffix = ", " + std::to_string(token_length) + " chars"; tracked_unary_functions_t result = { - {"std::rand % uint8" + suffix, unary_function_t([token_length](std::string_view alphabet) -> std::size_t { - using max_alphabet_size_t = std::uint8_t; - auto max_alphabet_size = static_cast(alphabet.size()); - for (std::size_t i = 0; i < token_length; ++i) { buffer[i] = alphabet[std::rand() % max_alphabet_size]; } - return token_length; + {"std::rand() & 0xFF", unary_function_t([](std::string_view token) -> std::size_t { + if (buffer.size() < token.size()) buffer.resize(token.size()); + for (std::size_t i = 0; i < token.size(); ++i) buffer[i] = static_cast(std::rand() & 0xFF); + return token.size(); + })}, + {"std::uniform_int", unary_function_t([](std::string_view token) -> std::size_t { + if (buffer.size() < token.size()) buffer.resize(token.size()); + randomize_string(buffer.data(), token.size()); + return token.size(); })}, - {"std::uniform_int" + suffix, unary_function_t([token_length](std::string_view alphabet) -> std::size_t { - randomize_string(buffer.data(), token_length, alphabet.data(), alphabet.size()); - return token_length; + {"sz::randomize", unary_function_t([](std::string_view token) -> std::size_t { + if (buffer.size() < token.size()) buffer.resize(token.size()); + sz::string_span span(buffer.data(), token.size()); + sz::fill_random(span); + return token.size(); })}, - // {"sz::randomize" + suffix, unary_function_t([token_length](std::string_view alphabet) -> std::size_t { - // sz::string_span span(buffer.data(), token_length); - // sz::randomize(span, global_random_generator(), alphabet); - // return token_length; - // })}, }; return result; } @@ -123,11 +121,11 @@ tracked_unary_functions_t random_generation_functions(std::size_t token_length) tracked_binary_functions_t equality_functions() { auto wrap_sz = [](auto function) -> binary_function_t { return binary_function_t([function](std::string_view a, std::string_view b) { - return (a.size() == b.size() && function(a.data(), b.data(), a.size())); + return a.size() == b.size() && function(a.data(), b.data(), a.size()); }); }; tracked_binary_functions_t result = { - {"std::string_view.==", [](std::string_view a, std::string_view b) { return (a == b); }}, + {"std::string_view.==", [](std::string_view a, std::string_view b) { return a == b; }}, {"sz_equal_serial", wrap_sz(sz_equal_serial), true}, #if SZ_USE_HASWELL {"sz_equal_haswell", wrap_sz(sz_equal_haswell), true}, @@ -190,6 +188,7 @@ void bench(strings_type &&strings) { bench_unary_functions(strings, 
hash_stream_functions()); bench_binary_functions(strings, equality_functions()); bench_binary_functions(strings, ordering_functions()); + bench_unary_functions(strings, random_generation_functions()); // Benchmark the cost of converting `std::string` and `sz::string` to `std::string_view`. // ! The results on a mixture of short and long strings should be similar. @@ -208,7 +207,9 @@ void bench_on_input_data(int argc, char const **argv) { std::printf("Benchmarking on real lines:\n"); bench(dataset.lines); std::printf("Benchmarking on entire dataset:\n"); - bench>({dataset.text}); + bench_unary_functions>({dataset.text}, bytesum_functions()); + bench_unary_functions>({dataset.text}, hash_functions()); + bench_unary_functions>({dataset.text}, hash_stream_functions()); // Run benchmarks on tokens of different length for (std::size_t token_length : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32}) { From 1d956019b42c0df9ef32cd546911c931f37febad Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 05:08:24 +0000 Subject: [PATCH 155/751] Improve: Test set intersections --- scripts/test.cpp | 150 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 127 insertions(+), 23 deletions(-) diff --git a/scripts/test.cpp b/scripts/test.cpp index a0eac08e..63452df4 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -49,6 +49,8 @@ #include // `std::random_device` #include // `std::ostringstream` #include // `std::unordered_map` +#include // `std::unordered_set` +#include // `std::set` #include // `std::vector` #include // Baseline @@ -149,6 +151,31 @@ static void test_arithmetical_utilities() { #endif } +static void test_structural_utilities() { + // Make sure the sequence helper functions work as expected + // for both trivial c-style arrays and + { + sz_sequence_t sequence; + sz_cptr_t strings[] = {"banana", "apple", "cherry"}; + sz_sequence_from_null_terminated_strings(strings, 3, &sequence); + assert(sequence.count == 3); + assert("banana"_sv == sequence.get_start(sequence.handle, 0)); + assert("apple"_sv == sequence.get_start(sequence.handle, 1)); + assert("cherry"_sv == sequence.get_start(sequence.handle, 2)); + } + + // sz_memory_allocator_init_default; + // sz_memory_allocator_init_fixed; + // _sz_extract_utf8_rune; + // sz_byteset_init; + // sz_byteset_init_ascii; + // sz_byteset_add_u8; + // sz_byteset_add; + // sz_byteset_contains_u8; + // sz_byteset_contains; + // sz_byteset_invert; +} + /** * @brief Hashes a string and compares the output between a serial and hardware-specific SIMD backend. * @@ -437,14 +464,14 @@ static void test_memory_utilities( // } #define assert_scoped(init, operation, condition) \ - { \ + do { \ init; \ operation; \ assert(condition); \ - } + } while (0) #define assert_throws(expression, exception_type) \ - { \ + do { \ bool threw = false; \ try { \ sz_unused(expression); \ @@ -453,7 +480,7 @@ static void test_memory_utilities( // threw = true; \ } \ assert(threw); \ - } + } while (0) /** * @brief Invokes different C++ member methods of immutable strings to cover all STL APIs. @@ -1684,9 +1711,10 @@ static void test_levenshtein_distances() { /** * Evaluates the correctness of look-up table transforms using random lookup tables. * - * @param misalignment The number of bytes to misalign the haystack within the cacheline. + * @param lookup_tables_to_try The number of random lookup tables to try. + * @param slices_per_table The number of random inputs to test per lookup table. 
*/ -void test_replacements(std::size_t lookup_tables_to_try = 128, std::size_t slices_per_table = 256) { +void test_replacements(std::size_t lookup_tables_to_try = 32, std::size_t slices_per_table = 16) { std::string body, transformed; body.resize(1024 * 1024); // 1MB @@ -1712,23 +1740,19 @@ void test_replacements(std::size_t lookup_tables_to_try = 128, std::size_t slice } /** - * @brief Tests sorting functionality. + * @brief Tests array sorting functionality, such as `argsort`, `sort`, and `sorted`. + * + * Tries to sort incrementally complex inputs, such as strings of varying lengths, with many equal inputs. + * 1. Basic tests with predetermined orders. + * 2. Test on long strings of identical length. + * 3. Test on random very small strings of varying lengths, likely with many equal inputs. + * 4. Test on random strings of varying lengths. + * 5. Test on random strings of varying lengths with zero characters. */ -static void test_sequence_algorithms() { +static void test_sorting_algorithms() { using strs_t = std::vector; using order_t = std::vector; - // Make sure teh helper functions work as expected. - { - sz_sequence_t sequence; - sz_cptr_t strings[] = {"banana", "apple", "cherry"}; - sz_sequence_from_null_terminated_strings(strings, 3, &sequence); - assert(sequence.count == 3); - assert("banana"_sv == sequence.get_start(sequence.handle, 0)); - assert("apple"_sv == sequence.get_start(sequence.handle, 1)); - assert("cherry"_sv == sequence.get_start(sequence.handle, 2)); - } - // Basic tests with predetermined orders. assert_scoped(strs_t x({"a", "b", "c", "d"}), (void)0, sz::argsort(x) == order_t({0u, 1u, 2u, 3u})); assert_scoped(strs_t x({"b", "c", "d", "a"}), (void)0, sz::argsort(x) == order_t({3u, 0u, 1u, 2u})); @@ -1796,6 +1820,84 @@ static void test_sequence_algorithms() { } } +/** + * @brief Tests array intersection functionality. + */ +static void test_intersecting_algorithms() { + using strs_t = std::vector; + using result_t = sz::intersect_result_t; + + // The mapping aren't guaranteed to be in any specific order, so we will sort them for comparisons. 
+ using idx_pair_t = std::pair; + using idx_pairs_t = std::set; + auto to_pairs = [](result_t const &result) -> idx_pairs_t { + idx_pairs_t pairs; + for (std::size_t i = 0; i < result.first_offsets.size(); ++i) + pairs.insert({result.first_offsets[i], result.second_offsets[i]}); + return pairs; + }; + + // Predetermined simple cases + { + strs_t abcd({"a", "b", "c", "d"}); + strs_t dcba({"d", "c", "b", "a"}); + strs_t abs({"a", "b", "s"}); + strs_t empty; + result_t result; + // Empty sets + { + result = sz::intersect(empty, empty); + assert(result.first_offsets.size() == 0 && result.second_offsets.size() == 0); + result = sz::intersect(abcd, empty); + assert(result.first_offsets.size() == 0 && result.second_offsets.size() == 0); + } + // Identity check + { + result = sz::intersect(abcd, abcd); + assert(result.first_offsets.size() == 4 && result.second_offsets.size() == 4); + assert(to_pairs(result) == idx_pairs_t({{0u, 0u}, {1u, 1u}, {2u, 2u}, {3u, 3u}})); + } + // Identical size, different order + { + result = sz::intersect(abcd, dcba); + assert(result.first_offsets.size() == 4 && result.second_offsets.size() == 4); + assert(to_pairs(result) == idx_pairs_t({{0u, 3u}, {1u, 2u}, {2u, 1u}, {3u, 0u}})); + } + // Different sets + { + result = sz::intersect(abcd, abs); + assert(result.first_offsets.size() == 2 && result.second_offsets.size() == 2); + assert(to_pairs(result) == idx_pairs_t({{0u, 0u}, {1u, 1u}})); + } + } + + // Generate random strings + struct { + std::size_t min_length; + std::size_t max_length; + std::size_t count_strings; + } experiments[] = { + {10, 10, 100}, + {15, 15, 1000}, + {5, 30, 2000}, + }; + for (auto experiment : experiments) { + std::unordered_set random_strings; + while (random_strings.size() < experiment.count_strings) + random_strings.insert(sz::scripts::random_string( + experiment.min_length + std::rand() % (experiment.max_length - experiment.min_length + 1), // + "ab", 2)); + + strs_t all_strings(random_strings.begin(), random_strings.end()); + strs_t first_half(all_strings.begin(), all_strings.begin() + all_strings.size() / 2); + + // Try different joins + result_t result; + result = sz::intersect(all_strings, first_half); + assert(result.first_offsets.size() == first_half.size() && result.second_offsets.size() == first_half.size()); + } +} + /** * @brief Tests constructing STL containers with StringZilla strings. */ @@ -1824,8 +1926,14 @@ int main(int argc, char const **argv) { // Basic utilities test_arithmetical_utilities(); + test_structural_utilities(); test_simd_against_serial(); + // Sequences of strings + test_sorting_algorithms(); + test_intersecting_algorithms(); + test_stl_containers(); + // Core APIs test_ascii_utilities(); test_ascii_utilities(); @@ -1862,10 +1970,6 @@ int main(int argc, char const **argv) { test_search_with_misaligned_repetitions(); #endif - // Sequences of strings - test_sequence_algorithms(); - test_stl_containers(); - std::printf("All tests passed... 
Unbelievable!\n"); return 0; } From de62723158719f184d9d81db3ac786e47883f391 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 05:08:44 +0000 Subject: [PATCH 156/751] Add: Feature-extraction placeholder --- include/stringzilla/features.h | 134 +++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 include/stringzilla/features.h diff --git a/include/stringzilla/features.h b/include/stringzilla/features.h new file mode 100644 index 00000000..1389a4cf --- /dev/null +++ b/include/stringzilla/features.h @@ -0,0 +1,134 @@ +/** + * @brief Hardware-accelerated feature extractions for string collections. + * @file features.h + * @author Ash Vardanian + * + * The `sklearn.feature_extraction` module for @b TF-IDF, "CountVectorizer", and "HashingVectorizer" + * is one of the most commonly used in the industry due to its extreme flexibility. It can: + * + * - Tokenize by words, N-grams, or in-word N-grams. + * - Use arbitrary Regular Expressions as word separators. + * - Return matrices of different types, normalized or not. + * - Exclude "stop words" and remove ASCII and Unicode accents. + * - Dynamically build a vocabulary or use a fixed list/dictionary. + * + * That level of flexibility is not feasible for a hardware-accelerated SIMD library, but we + * can provide a set of APIs that can be used to build such a library on top of StringZilla. + * That functionality will reuse our @b Trie data-structure for vocabulary building histograms. + * + */ +#ifndef STRINGZILLA_FEATURES_H_ +#define STRINGZILLA_FEATURES_H_ + +#include "types.h" + +#include "compare.h" // `sz_compare` +#include "memory.h" // `sz_copy` + +#ifdef __cplusplus +extern "C" { +#endif + +#pragma region Core API + +/** + * @brief Faster @b arg-sort for an arbitrary @b string sequence, using QuickSort. + * Outputs the @p order of elements in the immutable @p sequence, that would sort it. + * + * @param[in] sequence Immutable sequence of strings to sort. + * @param[in] alloc Optional memory allocator for temporary storage. + * @param[out] order Output permutation that sorts the elements. + * + * @retval `sz_success_k` if the operation was successful. + * @retval `sz_bad_alloc_k` if the operation failed due to memory allocation failure. + * @pre The @p order array must fit at least `sequence->count` integers. + * @post The @p order array will contain a valid permutation of `[0, sequence->count - 1]`. + * + * Example usage: + * + * @code{.c} + * #include + * int main() { + * char const *strings[] = {"banana", "apple", "cherry"}; + * sz_sequence_t sequence; + * sz_sequence_from_null_terminated_strings(strings, 3, &sequence); + * sz_sorted_idx_t order[3]; + * sz_status_t status = sz_sequence_argsort(&sequence, NULL, order); + * return status == sz_success_k && order[0] == 1 && order[1] == 0 && order[2] == 2 ? 0 : 1; + * } + * @endcode + * + * @note The algorithm has linear memory complexity, quadratic worst-case and log-linear average time complexity. + * @see https://en.wikipedia.org/wiki/Quicksort + * + * @note This algorithm is @b unstable: equal elements may change relative order. + * @sa sz_sequence_argsort_stabilize + * + * @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`. 
+ * @sa sz_sequence_argsort_serial, sz_sequence_argsort_skylake, sz_sequence_argsort_sve + */ +SZ_DYNAMIC sz_status_t sz_sequence_argsort(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order); + +enum sz_encoding_t { + sz_encoding_unknown_k = 0, + sz_encoding_ascii_k = 1, + sz_encoding_utf8_k = 2, + sz_encoding_utf16_k = 3, + sz_encoding_utf32_k = 4, + sz_encoding_jwt_k = 5, + sz_encoding_base64_k = 6, + // Low priority encodings: + sz_encoding_utf8bom_k = 7, + sz_encoding_utf16le_k = 8, + sz_encoding_utf16be_k = 9, + sz_encoding_utf32le_k = 10, + sz_encoding_utf32be_k = 11, +}; + +// Character Set Detection is one of the most commonly performed operations in data processing with +// [Chardet](https://github.com/chardet/chardet), [Charset Normalizer](https://github.com/jawah/charset_normalizer), +// [cChardet](https://github.com/PyYoshi/cChardet) being the most commonly used options in the Python ecosystem. +// All of them are notoriously slow. +// +// Moreover, as of October 2024, UTF-8 is the dominant character encoding on the web, used by 98.4% of websites. +// Other have minimal usage, according to [W3Techs](https://w3techs.com/technologies/overview/character_encoding): +// - ISO-8859-1: 1.2% +// - Windows-1252: 0.3% +// - Windows-1251: 0.2% +// - EUC-JP: 0.1% +// - Shift JIS: 0.1% +// - EUC-KR: 0.1% +// - GB2312: 0.1% +// - Windows-1250: 0.1% +// Within programming language implementations and database management systems, 16-bit and 32-bit fixed-width encodings +// are also very popular and we need a way to efficiently differentiate between the most common UTF flavors, ASCII, and +// the rest. +// +// One good solution is the [simdutf](https://github.com/simdutf/simdutf) library, but it depends on the C++ runtime +// and focuses more on incremental validation & transcoding, rather than detection. +// +// So we need a very fast and efficient way of determining +SZ_PUBLIC sz_bool_t sz_detect_encoding(sz_cptr_t text, sz_size_t length) { + // https://github.com/simdutf/simdutf/blob/master/src/icelake/icelake_utf8_validation.inl.cpp + // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_from_utf8.inl.cpp#L81 + // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L661 + // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L788 + + // We can implement this operation simpler & differently, assuming most of the time continuous chunks of memory + // have identical encoding. With Russian and many European languages, we generally deal with 2-byte codepoints + // with occasional 1-byte punctuation marks. In the case of Chinese, Japanese, and Korean, we deal with 3-byte + // codepoints. In the case of emojis, we deal with 4-byte codepoints. + // We can also use the idea, that misaligned reads are quite cheap on modern CPUs. 
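+    //
+    // A hedged sketch of such signals (none of this is implemented yet, the flags below are placeholders):
+    //   - if every sampled byte is below 0x80, the chunk is plain ASCII;
+    //   - if continuation bytes (0b10xxxxxx) only ever follow valid UTF-8 leading bytes, lean towards UTF-8;
+    //   - if every second byte (or three out of every four) is zero, lean towards UTF-16 (or UTF-32).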
+ int can_be_ascii = 1, can_be_utf8 = 1, can_be_utf16 = 1, can_be_utf32 = 1; + sz_unused(can_be_ascii + can_be_utf8 + can_be_utf16 + can_be_utf32); + sz_unused(text && length); + return sz_false_k; +} + +#pragma endregion // Core API + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // STRINGZILLA_FEATURES_H_ From 197cd8719017f9fd43d1f4327e0d3f5fed477340 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 05:09:21 +0000 Subject: [PATCH 157/751] Docs: Exploring perfect Unicode hashing --- ...shtein.ipynb => explore_levenshtein.ipynb} | 0 scripts/explore_unicode.ipynb | 646 ++++++++++++++++++ 2 files changed, 646 insertions(+) rename scripts/{test_levenshtein.ipynb => explore_levenshtein.ipynb} (100%) create mode 100644 scripts/explore_unicode.ipynb diff --git a/scripts/test_levenshtein.ipynb b/scripts/explore_levenshtein.ipynb similarity index 100% rename from scripts/test_levenshtein.ipynb rename to scripts/explore_levenshtein.ipynb diff --git a/scripts/explore_unicode.ipynb b/scripts/explore_unicode.ipynb new file mode 100644 index 00000000..86af3fd6 --- /dev/null +++ b/scripts/explore_unicode.ipynb @@ -0,0 +1,646 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Unicode and Perfect Hashing\n", + "\n", + "Generalizing StringZilla from byte-strings to UTF-8 strings requires a deep understanding of Unicode.\n", + "This notebook is a playground to explore Unicode and UTF-8 encoding.\n", + "Most importantly it provides a snippet for finding the perfect-hash for unicode, which allows us to produce more efficient histograms and lookup tables for unicode characters.\n", + "That cab be a constituent part of any UTF-8-aware text-processing algorithm, be it Levenshtein automata or distance calculation, Aho-Corasick automata, or high-level NLP tasks, like feature extraction or text classification." 
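+ ,"\n",
+ "\n",
+ "Concretely, the search in the later cells is for a multiplicative scheme: a 32-bit multiplier `M` and a table size `S` such that `(code_point * M) % S` lands every assigned code point in a distinct slot, i.e. a perfect hash over the assigned range."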
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -q numba numpy tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from numba import jit as njit\n", + "from tqdm import tqdm\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloaded 10,047,830 bytes of UCD XML data\n", + "Files in zip: ['ucd.all.flat.xml']\n" + ] + } + ], + "source": [ + "import urllib.request\n", + "import io\n", + "import zipfile\n", + "import xml.etree.ElementTree as ET\n", + "\n", + "# URL for the latest UCD XML archive (flattened)\n", + "ucd_zip_url = \"https://www.unicode.org/Public/UCD/latest/ucdxml/ucd.all.flat.zip\"\n", + "\n", + "# Download the ZIP file\n", + "with urllib.request.urlopen(ucd_zip_url) as response:\n", + " zip_data = response.read()\n", + "print(f\"Downloaded {len(zip_data):,} bytes of UCD XML data\")\n", + "\n", + "# Read the ZIP file from memory\n", + "zip_bytes = io.BytesIO(zip_data)\n", + "with zipfile.ZipFile(zip_bytes) as zf:\n", + " # List files in the zip archive (typically one XML file)\n", + " file_list = zf.namelist()\n", + " print(\"Files in zip:\", file_list)\n", + " # Assuming the first file is the desired XML file\n", + " xml_filename = file_list[0]\n", + " with zf.open(xml_filename) as xml_file:\n", + " # Parse the XML file\n", + " tree = ET.parse(xml_file)\n", + " root = tree.getroot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The XML structure contains a `` element with many `` elements.\n", + "Each `` element has attributes, including:\n", + "\n", + "- `'cp'`: the code point (as hexadecimal)\n", + "- `'na'`: the character name\n", + "- `'gc'`: the general category" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total code points processed (after expanding ranges): 155,063\n" + ] + } + ], + "source": [ + "# Use a namespace-agnostic search for all elements ending with 'char'\n", + "chars = [elem for elem in root.iter() if elem.tag.endswith('char')]\n", + "\n", + "# List to hold all characters (expanded ranges)\n", + "all_chars = []\n", + "\n", + "def process_char(elem):\n", + " \"\"\"\n", + " Process a element, handling all individual code points (cp)\n", + " and ignoring ranges (first-cp and last-cp). 
Appends each code point to all_chars.\n", + " \"\"\"\n", + " if 'cp' in elem.attrib:\n", + " cp = int(elem.attrib['cp'], 16)\n", + " entry = {\n", + " 'cp': cp,\n", + " 'name': elem.attrib.get('na', '').strip(),\n", + " 'gc': elem.attrib.get('gc', '').strip(),\n", + " 'age': elem.attrib.get('age', '').strip()\n", + " # You can add pull attributes here if needed.\n", + " }\n", + " all_chars.append(entry)\n", + "\n", + "# Process every 'char' element found\n", + "for elem in chars:\n", + " process_char(elem)\n", + "\n", + "print(f\"Total code points processed (after expanding ranges): {len(all_chars):,}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Unicode standard defines a range of 1,114,112 possible code points (from U+0000 to U+10FFFF), but only a subset of these are actually assigned characters or have specific property data.\n", + "As of Unicode version 16.0, there are 155,063 characters with code points, covering 168 modern and historical scripts, as well as multiple symbol sets, split into [338 blocks](https://en.wikipedia.org/wiki/Unicode_block).\n", + "Let's random sample and print a few:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Example symbols:\n", + "U+0C15: TELUGU LETTER KA (Lo)\n", + "U+1016: MYANMAR LETTER PHA (Lo)\n", + "U+98B4: CJK UNIFIED IDEOGRAPH-# (Lo)\n", + "U+2BE19: CJK UNIFIED IDEOGRAPH-# (Lo)\n", + "U+297D: RIGHT FISH TAIL (Sm)\n", + "U+3D9A: CJK UNIFIED IDEOGRAPH-# (Lo)\n", + "U+2F95: KANGXI RADICAL VALLEY (So)\n", + "U+9527: CJK UNIFIED IDEOGRAPH-# (Lo)\n", + "U+16F6E: MIAO VOWEL SIGN UU (Mc)\n", + "U+28B5F: CJK UNIFIED IDEOGRAPH-# (Lo)\n" + ] + } + ], + "source": [ + "import random\n", + "\n", + "random_chars = random.sample(all_chars, 10)\n", + "\n", + "print(\"Example symbols:\")\n", + "for char in random_chars:\n", + " cp = char['cp']\n", + " na = char['name']\n", + " gc = char['gc']\n", + " print(f\"U+{cp:04X}: {na} ({gc})\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A natural question can be asked, is that set of codepoints dense or does it contain holes?" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Highest code point: U+E01EF (917,999)\n", + "Number of holes: 762,936\n" + ] + } + ], + "source": [ + "highest_code_point = max(char['cp'] for char in all_chars)\n", + "print(f\"Highest code point: U+{highest_code_point:04X} ({highest_code_point:,})\")\n", + "count_holes = highest_code_point - len(all_chars)\n", + "print(f\"Number of holes: {count_holes:,}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The presence of holes means, that simply using the code-point itself as a lookup index would result in a significant \"memory amplification\" factor, lower data locality, and very uneven distribution of data." 
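+ ,"\n",
+ "\n",
+ "As a quick back-of-the-envelope check using the counts above: the highest assigned code point is 917,999 (U+E01EF) while only 155,063 code points are assigned, so indexing a table directly by code point would make it about 5.9 times larger than necessary, which is exactly the ratio computed in the next cell."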
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Memory amplification: 5.9\n" + ] + } + ], + "source": [ + "memory_amplification = 1.0 * highest_code_point / len(all_chars)\n", + "print(f\"Memory amplification: {memory_amplification:.1f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For various hash-functions, we may want to find the smallest buffer size that results in no collisions.\n", + "Moreover, assuming how small code-points can be, we would prefer hash-functions that only rely on 32-bit arithmetic and avoid expensive operations.\n", + "We may want to start by using a power-of-two hash-table size, as the final stage of the hash-function can be a simple bitwise-and operation.\n", + "\n", + "- $2^{17} = 131072$ is the closes power of two to the number of code-points.\n", + "- $2^{18} = 262144$ is the next power of two - the first one that fits all code-points.\n", + "\n", + "The latter would still have a 69% memory amplifications factor with only 59% of the slots filled." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's export all code-points to a flat NumPy array and for efficiency, calculate all hashes at once." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Memory usage for code points: 620,252 bytes\n" + ] + } + ], + "source": [ + "code_points = np.array([char['cp'] for char in all_chars], dtype=np.uint32)\n", + "print(f\"Memory usage for code points: {code_points.nbytes:,} bytes\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# ---------------------------------------------------------------------\n", + "# 1. Jenkins One-at-a-Time Hash\n", + "# ---------------------------------------------------------------------\n", + "def hash_all_jenkins(code_points: np.ndarray) -> np.ndarray:\n", + " # Ensure input is np.uint32.\n", + " code_points = code_points.astype(np.uint32)\n", + " h = np.zeros_like(code_points, dtype=np.uint32)\n", + " # Process each of the 4 bytes of the 32-bit integer.\n", + " for shift in (0, 8, 16, 24):\n", + " # Extract one byte at a time.\n", + " b = (code_points >> shift) & np.uint32(0xFF)\n", + " h = (h + b) & np.uint32(0xFFFFFFFF)\n", + " h = (h + (h << np.uint32(10))) & np.uint32(0xFFFFFFFF)\n", + " h = (h ^ (h >> np.uint32(6))) & np.uint32(0xFFFFFFFF)\n", + " h = (h + (h << np.uint32(3))) & np.uint32(0xFFFFFFFF)\n", + " h = (h ^ (h >> np.uint32(11))) & np.uint32(0xFFFFFFFF)\n", + " h = (h + (h << np.uint32(15))) & np.uint32(0xFFFFFFFF)\n", + " return h\n", + "\n", + "# ---------------------------------------------------------------------\n", + "# 2. 
FNV-1a Hash (32-bit)\n", + "# ---------------------------------------------------------------------\n", + "def hash_all_fnv1a(code_points: np.ndarray) -> np.ndarray:\n", + " # FNV-1a 32-bit parameters\n", + " FNV_offset = np.uint32(0x811C9DC5)\n", + " FNV_prime = np.uint32(16777619)\n", + " code_points = code_points.astype(np.uint32)\n", + " h = np.full_like(code_points, FNV_offset, dtype=np.uint32)\n", + " # Process each of the 4 bytes\n", + " for shift in (0, 8, 16, 24):\n", + " byte = (code_points >> shift) & np.uint32(0xFF)\n", + " h = h ^ byte\n", + " h = (h * FNV_prime) & np.uint32(0xFFFFFFFF)\n", + " return h\n", + "\n", + "# ---------------------------------------------------------------------\n", + "# 3. Thomas Wang's 32-bit Integer Hash\n", + "# ---------------------------------------------------------------------\n", + "def hash_all_thomas_wang(code_points: np.ndarray) -> np.ndarray:\n", + " code_points = code_points.astype(np.uint32)\n", + " x = code_points.copy()\n", + " x = (x ^ np.uint32(61)) ^ (x >> np.uint32(16))\n", + " x = (x + (x << np.uint32(3))) & np.uint32(0xFFFFFFFF)\n", + " x = x ^ (x >> np.uint32(4))\n", + " x = (x * np.uint32(0x27d4eb2d)) & np.uint32(0xFFFFFFFF)\n", + " x = x ^ (x >> np.uint32(15))\n", + " return x\n", + "\n", + "# ---------------------------------------------------------------------\n", + "# 4. MurmurHash3 (x86 32-bit variant for 4-byte input)\n", + "# ---------------------------------------------------------------------\n", + "def hash_all_murmur3(code_points: np.ndarray, seed: np.uint32 = np.uint32(0)) -> np.ndarray:\n", + " code_points = code_points.astype(np.uint32)\n", + " c1 = np.uint32(0xcc9e2d51)\n", + " c2 = np.uint32(0x1b873593)\n", + " r1 = np.uint32(15)\n", + " r2 = np.uint32(13)\n", + " m = np.uint32(5)\n", + " n = np.uint32(0xe6546b64)\n", + " \n", + " # Treat each 32-bit integer as 4 bytes of data.\n", + " k = (code_points * c1) & np.uint32(0xFFFFFFFF)\n", + " k = ((k << r1) | (k >> (32 - r1))) & np.uint32(0xFFFFFFFF)\n", + " k = (k * c2) & np.uint32(0xFFFFFFFF)\n", + " \n", + " h = seed ^ k\n", + " h = ((h << r2) | (h >> (32 - r2))) & np.uint32(0xFFFFFFFF)\n", + " h = (h * m + n) & np.uint32(0xFFFFFFFF)\n", + " \n", + " # Since input length is always 4 bytes for a 32-bit integer:\n", + " h ^= np.uint32(4)\n", + " # Finalization mix\n", + " h ^= (h >> np.uint32(16))\n", + " h = (h * np.uint32(0x85ebca6b)) & np.uint32(0xFFFFFFFF)\n", + " h ^= (h >> np.uint32(13))\n", + " h = (h * np.uint32(0xc2b2ae35)) & np.uint32(0xFFFFFFFF)\n", + " h ^= (h >> np.uint32(16))\n", + " return h" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "def count_unique(x: np.ndarray) -> int:\n", + " # This approach is about 50% faster than `len(np.unique(x))`.\n", + " if x.size == 0:\n", + " return 0\n", + " xs = np.sort(x)\n", + " return int(np.count_nonzero(np.diff(xs)) + 1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "def rotate_left(x: np.ndarray, r: int) -> np.ndarray:\n", + " \"\"\"Rotate left the 32-bit integers in x by r bits.\"\"\"\n", + " return ((x << np.uint32(r)) | (x >> np.uint32(32 - r))) & np.uint32(0xFFFFFFFF)\n", + "\n", + "def hash_custom(code_points: np.ndarray) -> np.ndarray:\n", + " \"\"\"\n", + " Compute a composite hash on an array of 32-bit integers.\n", + " The hash is a combination of multiplications, rotations, and XOR mixing.\n", + " \"\"\"\n", + " # Ensure code_points are treated as 32-bit 
unsigned integers.\n", + " x = code_points.astype(np.uint32)\n", + " \n", + " # First mixing stage:\n", + " # Multiply by a constant and then rotate left.\n", + " x = (x * np.uint32(0xcc9e2d51)) & np.uint32(0xFFFFFFFF)\n", + " x = rotate_left(x, 15)\n", + " \n", + " # Second stage: XOR with a constant.\n", + " x ^= np.uint32(0x1b873593)\n", + " \n", + " # Third stage: Multiply and then rotate.\n", + " x = (x * np.uint32(0x85ebca6b)) & np.uint32(0xFFFFFFFF)\n", + " x = rotate_left(x, 13)\n", + " \n", + " # Fourth stage: Final XOR mix.\n", + " x ^= np.uint32(0xc2b2ae35)\n", + " \n", + " # Optionally, perform one more multiplication to scramble bits further.\n", + " x = (x * np.uint32(0x27d4eb2d)) & np.uint32(0xFFFFFFFF)\n", + " \n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Jenkins One-at-a-Time:\n", + "Unique hashes: 154,839 = 99.8555%\n", + "Unique hashes (modulo size): 97,909 = 63.1414%\n", + "Unique hashes (modulo 262144): 117,278 = 75.6325%\n", + "\n", + "FNV-1a:\n", + "Unique hashes: 155,063 = 100.0000%\n", + "Unique hashes (modulo size): 101,078 = 65.1851%\n", + "Unique hashes (modulo 262144): 104,858 = 67.6228%\n", + "\n", + "Thomas Wang's Hash:\n", + "Unique hashes: 155,063 = 100.0000%\n", + "Unique hashes (modulo size): 98,080 = 63.2517%\n", + "Unique hashes (modulo 262144): 117,089 = 75.5106%\n", + "\n", + "MurmurHash3:\n", + "Unique hashes: 155,063 = 100.0000%\n", + "Unique hashes (modulo size): 98,034 = 63.2220%\n", + "Unique hashes (modulo 262144): 116,970 = 75.4339%\n", + "\n", + "Custom:\n", + "Unique hashes: 155,063 = 100.0000%\n", + "Unique hashes (modulo size): 98,139 = 63.2898%\n", + "Unique hashes (modulo 262144): 116,442 = 75.0933%\n" + ] + } + ], + "source": [ + "for name, func in [\n", + " ('Jenkins One-at-a-Time', hash_all_jenkins),\n", + " ('FNV-1a', hash_all_fnv1a),\n", + " (\"Thomas Wang's Hash\", hash_all_thomas_wang),\n", + " ('MurmurHash3', hash_all_murmur3),\n", + " ('Custom', hash_custom),\n", + "]:\n", + " print(f\"\\n{name}:\")\n", + " hashes = func(code_points)\n", + " \n", + " unique_hashes = count_unique(hashes)\n", + " print(f\"Unique hashes: {unique_hashes:,} = {unique_hashes / len(code_points):.4%}\")\n", + " \n", + " # Lets estimate the number of collisions for different modulo values\n", + " hashes_modulo_valid = hashes % len(code_points)\n", + " unique_hashes_modulo_valid = count_unique(hashes_modulo_valid)\n", + " print(f\"Unique hashes (modulo size): {unique_hashes_modulo_valid:,} = {unique_hashes_modulo_valid / len(code_points):.4%}\")\n", + " \n", + " # Try the next power of 2 for modulo size\n", + " bitceil = 2 ** 18\n", + " hashes_modulo_bitceil = hashes % bitceil\n", + " unique_hashes_modulo_bitceil = count_unique(hashes_modulo_bitceil)\n", + " print(f\"Unique hashes (modulo {bitceil}): {unique_hashes_modulo_bitceil:,} = {unique_hashes_modulo_bitceil / len(code_points):.4%}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We end up with a fairly high collision rate of around 37% with vocabulary-size modulo and slightly more tolerable 25% with the next power of two.\n", + "Still, that's far from perfect-hashing.\n", + "Let's try different multiplicative hash-functions and see if we can find a better one." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Take the range of all 32-bit unsigned integers\n", + "# ? 
Random shuffling to simplify the search for the first multiplier is a good idea,\n", + "# ? but it would take forever to run on 4 billion elements in Python.\n", + "# ! all_integers = np.arange(1, 2**32, dtype=np.uint32)\n", + "# ! np.random.shuffle(all_integers)\n", + "all_integers = np.random.permutation(2**32)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Memory usage for all_integers: 17,179,869,184 bytes\n" + ] + } + ], + "source": [ + "all_integers = all_integers.astype(np.uint32)\n", + "print(f\"Memory usage for all_integers: {all_integers.nbytes:,} bytes\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Take the range of all 32-bit unsigned integers\n", + "bitceil = 2 ** 18\n", + "for multiplier in tqdm(all_integers):\n", + " hashes = code_points * multiplier\n", + " \n", + " # Lets estimate the number of collisions for different modulo values\n", + " hashes_modulo_valid = hashes % len(code_points)\n", + " unique_hashes_modulo_valid = count_unique(hashes_modulo_valid)\n", + " if unique_hashes_modulo_valid == len(code_points):\n", + " print(f\"Multiplier (modulo size): {multiplier}\")\n", + " break\n", + " \n", + " # Try the next power of 2 for modulo size\n", + " hashes_modulo_bitceil = hashes % bitceil\n", + " unique_hashes_modulo_bitceil = count_unique(hashes_modulo_bitceil)\n", + " if unique_hashes_modulo_bitceil == len(code_points):\n", + " print(f\"Multiplier (modulo {bitceil}): {multiplier}\")\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from numba import uint32\n", + "\n", + "@njit(nopython=True)\n", + "def check_multiplier(code_points: np.ndarray, multiplier: uint32, seen_flags: np.ndarray) -> bool:\n", + " \"\"\"\n", + " Check if the multiplier produces a perfect hash mapping\n", + " for the given code_points with modulus `len(seen_flags)`.\n", + " Returns True if no collisions are found, False otherwise.\n", + " \"\"\"\n", + " # Create an array of flags for each hash value.\n", + " n = code_points.shape[0]\n", + " modulo = uint32(len(seen_flags))\n", + " for i in range(n):\n", + " # Compute hash value (simulate 32-bit wrap-around implicitly via modulo arithmetic)\n", + " h = uint32(code_points[i] * multiplier) % modulo\n", + " if seen_flags[h] == 1:\n", + " # Collision found.\n", + " return False\n", + " seen_flags[h] = 1\n", + " return True\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 13887159/4294967296 [2:11:09<673:54:25, 1764.62it/s]\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[18]\u001b[39m\u001b[32m, line 14\u001b[39m\n\u001b[32m 11\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMultiplier (modulo size): \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmultiplier\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 12\u001b[39m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m14\u001b[39m 
\u001b[38;5;28;01mif\u001b[39;00m \u001b[43mcheck_multiplier\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode_points\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmultiplier\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mseen_modulo_bitceil\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[32m 15\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMultiplier (modulo \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbitceil\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m): \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmultiplier\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 16\u001b[39m \u001b[38;5;28;01mbreak\u001b[39;00m\n", + "\u001b[31mKeyboardInterrupt\u001b[39m: " + ] + } + ], + "source": [ + "# Take the range of all 32-bit unsigned integers\n", + "seen_modulo_vocabulary = np.zeros(len(code_points), dtype=np.uint8)\n", + "seen_modulo_bitceil = np.zeros(2 ** 18, dtype=np.uint8)\n", + "\n", + "for multiplier in tqdm(all_integers):\n", + " seen_modulo_vocabulary.fill(0)\n", + " seen_modulo_bitceil.fill(0)\n", + "\n", + " # Lets estimate the number of collisions for different modulo values\n", + " if check_multiplier(code_points, multiplier, seen_modulo_vocabulary):\n", + " print(f\"Multiplier (modulo size): {multiplier}\")\n", + " break\n", + "\n", + " if check_multiplier(code_points, multiplier, seen_modulo_bitceil):\n", + " print(f\"Multiplier (modulo {bitceil}): {multiplier}\")\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 63daa5f56e3ab92a528936d97d55eab379e29af8 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 05:33:52 +0000 Subject: [PATCH 158/751] Add: `status_t` for errors in C++ --- include/stringzilla/stringzilla.hpp | 152 +++++++++++++++++----------- 1 file changed, 93 insertions(+), 59 deletions(-) diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index 98817dd7..87e14831 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -1062,6 +1062,31 @@ std::size_t range_length(iterator_type first, iterator_type last) { #pragma endregion +#pragma region Helper Types + +enum class status_t { + success_k = sz_success_k, + bad_alloc_k = sz_bad_alloc_k, + invalid_utf8_k = sz_invalid_utf8_k, + contains_duplicates_k = sz_contains_duplicates_k, +}; + +#if !SZ_AVOID_STL +void raise(status_t status) noexcept(false) { + switch (status) { + case status_t::bad_alloc_k: throw std::bad_alloc(); + case status_t::invalid_utf8_k: throw std::invalid_argument("Invalid UTF-8 string"); + case status_t::contains_duplicates_k: throw std::invalid_argument("Array contains identical strings"); + default: break; + } +} + +using sorted_idx_t = sz_sorted_idx_t; + +#endif + +#pragma endregion + #pragma region Global Operations with Dynamic Memory template @@ -1082,16 +1107,20 @@ static sz_u64_t _call_random_generator(void *state) noexcept { } template -static bool _with_alloc(allocator_type_ &allocator, 
allocator_callback_ &&callback) noexcept { +static status_t _with_alloc(allocator_type_ &allocator, allocator_callback_ &&callback) noexcept { sz_memory_allocator_t alloc; alloc.allocate = &_call_allocate; alloc.free = &_call_free; alloc.handle = &allocator; - return callback(alloc); + return static_cast(callback(alloc)); } +/** + * @brief Helper function, wrapping a C++ allocator into a C-style allocator. + * @return Error code or success. All allocating functions may fail. + */ template -static bool _with_alloc(allocator_callback_ &&callback) noexcept { +static status_t _with_alloc(allocator_callback_ &&callback) noexcept { allocator_type_ allocator; return _with_alloc(allocator, std::forward(callback)); } @@ -2038,23 +2067,23 @@ class basic_string { static_assert(std::is_empty::value, "We currently only support stateless allocators"); template - static bool _with_alloc(allocator_callback &&callback) noexcept { + static status_t _with_alloc(allocator_callback &&callback) noexcept { return ashvardanian::stringzilla::_with_alloc(callback); } void init(std::size_t length, char_type value) noexcept(false) { sz_ptr_t start; - if (!_with_alloc( - [&](sz_alloc_type &alloc) { return (start = sz_string_init_length(&string_, length, &alloc)); })) - throw std::bad_alloc(); + raise(_with_alloc([&](sz_alloc_type &alloc) { + return (start = sz_string_init_length(&string_, length, &alloc)) ? sz_success_k : sz_bad_alloc_k; + })); sz_fill(start, length, *(sz_u8_t *)&value); } void init(string_view other) noexcept(false) { sz_ptr_t start; - if (!_with_alloc( - [&](sz_alloc_type &alloc) { return (start = sz_string_init_length(&string_, other.size(), &alloc)); })) - throw std::bad_alloc(); + raise(_with_alloc([&](sz_alloc_type &alloc) { + return (start = sz_string_init_length(&string_, other.size(), &alloc)) ? sz_success_k : sz_bad_alloc_k; + })); sz_copy(start, (sz_cptr_t)other.data(), other.size()); } @@ -2121,7 +2150,7 @@ class basic_string { ~basic_string() noexcept { _with_alloc([&](sz_alloc_type &alloc) { sz_string_free(&string_, &alloc); - return true; + return sz_success_k; }); } @@ -2130,7 +2159,7 @@ class basic_string { if (!is_internal()) { _with_alloc([&](sz_alloc_type &alloc) { sz_string_free(&string_, &alloc); - return true; + return sz_success_k; }); } move(other); @@ -2750,7 +2779,10 @@ class basic_string { * @return `true` if the operation was successful and potentially reduced the memory footprint, `false` otherwise. */ bool try_shrink_to_fit() noexcept { - return _with_alloc([&](sz_alloc_type &alloc) { return sz_string_shrink_to_fit(&string_, &alloc); }); + auto status = _with_alloc([&](sz_alloc_type &alloc) { + return sz_string_shrink_to_fit(&string_, &alloc) ? sz_success_k : sz_bad_alloc_k; + }); + return status == status_t::success_k; } /** @@ -2759,7 +2791,10 @@ class basic_string { * @return `true` if the reservation was successful, `false` otherwise. */ bool try_reserve(size_type capacity) noexcept { - return _with_alloc([&](sz_alloc_type &alloc) { return sz_string_reserve(&string_, capacity, &alloc); }); + auto status = _with_alloc([&](sz_alloc_type &alloc) { + return sz_string_reserve(&string_, capacity, &alloc) ? 
sz_success_k : sz_bad_alloc_k; + }); + return status == status_t::success_k; } /** @@ -2827,9 +2862,10 @@ class basic_string { bool try_insert(difference_type signed_offset, string_view string) noexcept { sz_size_t normalized_offset, normalized_length; sz_ssize_clamp_interval(size(), signed_offset, 0, &normalized_offset, &normalized_length); - if (!_with_alloc([&](sz_alloc_type &alloc) { - return sz_string_expand(&string_, normalized_offset, string.size(), &alloc); - })) + if (_with_alloc([&](sz_alloc_type &alloc) { + return sz_string_expand(&string_, normalized_offset, string.size(), &alloc) ? sz_success_k + : sz_bad_alloc_k; + }) != status_t::success_k) return false; sz_copy(data() + normalized_offset, string.data(), string.size()); @@ -2909,10 +2945,9 @@ class basic_string { basic_string &insert(size_type offset, string_view other) noexcept(false) { if (offset > size()) throw std::out_of_range("sz::basic_string::insert"); if (size() + other.size() > max_size()) throw std::length_error("sz::basic_string::insert"); - if (!_with_alloc( - [&](sz_alloc_type &alloc) { return sz_string_expand(&string_, offset, other.size(), &alloc); })) - throw std::bad_alloc(); - + raise(_with_alloc([&](sz_alloc_type &alloc) { + return sz_string_expand(&string_, offset, other.size(), &alloc) ? sz_success_k : sz_bad_alloc_k; + })); sz_copy(data() + offset, other.data(), other.size()); return *this; } @@ -2977,8 +3012,9 @@ class basic_string { auto added_length = range_length(first, last); if (size() + added_length > max_size()) throw std::length_error("sz::basic_string::insert"); - if (!_with_alloc([&](sz_alloc_type &alloc) { return sz_string_expand(&string_, pos, added_length, &alloc); })) - throw std::bad_alloc(); + raise(_with_alloc([&](sz_alloc_type &alloc) { + return sz_string_expand(&string_, pos, added_length, &alloc) ? sz_success_k : sz_bad_alloc_k; + })); iterator result = begin() + pos; for (iterator output = result; first != last; ++first, ++output) *output = *first; @@ -3327,8 +3363,7 @@ class basic_string { size_type edit_distance(string_view other, size_type bound = 0) const noexcept { size_type result; _with_alloc([&](sz_alloc_type &alloc) { - return sz_levenshtein_distance(data(), size(), other.data(), other.size(), bound, &alloc, &result) != - sz_bad_alloc_k; + return sz_levenshtein_distance(data(), size(), other.data(), other.size(), bound, &alloc, &result); }); return result; } @@ -3485,9 +3520,10 @@ bool basic_string::try_resize(size_type count, value_typ // Allocate more space if needed. if (count >= string_space) { - if (!_with_alloc([&](sz_alloc_type &alloc) { - return sz_string_expand(&string_, SZ_SIZE_MAX, count - string_length, &alloc) != NULL; - })) + if (_with_alloc([&](sz_alloc_type &alloc) { + return sz_string_expand(&string_, SZ_SIZE_MAX, count - string_length, &alloc) ? sz_success_k + : sz_bad_alloc_k; + }) != status_t::success_k) return false; sz_string_unpack(&string_, &string_start, &string_length, &string_space, &string_is_external); } @@ -3525,12 +3561,12 @@ bool basic_string::try_assign(string_view other) noexcep } // In the common case, however, we need to allocate. 
else { - if (!_with_alloc([&](sz_alloc_type &alloc) { + if (_with_alloc([&](sz_alloc_type &alloc) { string_start = sz_string_expand(&string_, SZ_SIZE_MAX, other.length() - string_length, &alloc); - if (!string_start) return false; + if (!string_start) return sz_bad_alloc_k; other.copy(string_start, other.length()); - return true; - })) + return sz_success_k; + }) != status_t::success_k) return false; } return true; @@ -3538,18 +3574,19 @@ bool basic_string::try_assign(string_view other) noexcep template bool basic_string::try_push_back(char_type c) noexcept { - return _with_alloc([&](sz_alloc_type &alloc) { + auto result = _with_alloc([&](sz_alloc_type &alloc) { auto old_size = size(); sz_ptr_t start = sz_string_expand(&string_, SZ_SIZE_MAX, 1, &alloc); - if (!start) return false; + if (!start) return sz_bad_alloc_k; start[old_size] = c; - return true; + return sz_success_k; }); + return result == status_t::success_k; } template bool basic_string::try_append(const_pointer str, size_type length) noexcept { - return _with_alloc([&](sz_alloc_type &alloc) { + auto result = _with_alloc([&](sz_alloc_type &alloc) { // Sometimes we are inserting part of this string into itself. // By the time `sz_string_expand` finished, the old `str` pointer may be invalidated, // so we need to handle that special case separately. @@ -3557,16 +3594,17 @@ bool basic_string::try_append(const_pointer str, size_ty if (str >= this_span.begin() && str < this_span.end()) { auto str_offset_in_this = str - data(); sz_ptr_t start = sz_string_expand(&string_, SZ_SIZE_MAX, length, &alloc); - if (!start) return false; + if (!start) return sz_bad_alloc_k; sz_copy(start + this_span.size(), start + str_offset_in_this, length); } else { sz_ptr_t start = sz_string_expand(&string_, SZ_SIZE_MAX, length, &alloc); - if (!start) return false; + if (!start) return sz_bad_alloc_k; sz_copy(start + this_span.size(), str, length); } - return true; + return sz_success_k; }); + return result == status_t::success_k; } template @@ -3677,12 +3715,12 @@ bool basic_string::try_assign(concatenation::try_preparing_replacement( // assert(offset + length <= size()); // 1. The replacement is the same length as the replaced range. - if (replacement_length == length) { return true; } + if (replacement_length == length) return true; // 2. The replacement is shorter than the replaced range. else if (replacement_length < length) { @@ -3708,9 +3746,11 @@ bool basic_string::try_preparing_replacement( // } // 3. The replacement is longer than the replaced range. An allocation may occur. else { - return _with_alloc([&](sz_alloc_type &alloc) { - return sz_string_expand(&string_, offset + length, replacement_length - length, &alloc); + auto result = _with_alloc([&](sz_alloc_type &alloc) { + return sz_string_expand(&string_, offset + length, replacement_length - length, &alloc) ? 
sz_success_k + : sz_bad_alloc_k; }); + return result == status_t::success_k; } } @@ -3849,11 +3889,9 @@ std::size_t edit_distance( // basic_string_slice const &a, basic_string_slice const &b, std::size_t bound = SZ_SIZE_MAX, allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) { std::size_t result; - if (!_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) { - return sz_levenshtein_distance(a.data(), a.size(), b.data(), b.size(), bound, &alloc, &result) != - sz_bad_alloc_k; - })) - throw std::bad_alloc(); + raise(_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) { + return sz_levenshtein_distance(a.data(), a.size(), b.data(), b.size(), bound, &alloc, &result); + })); return result; } @@ -3877,11 +3915,9 @@ std::size_t edit_distance_utf8( basic_string_slice const &a, basic_string_slice const &b, // std::size_t bound = SZ_SIZE_MAX, allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) { std::size_t result; - if (!_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) { - return sz_levenshtein_distance_utf8(a.data(), a.size(), b.data(), b.size(), bound, &alloc, &result) != - sz_bad_alloc_k; - })) - throw std::bad_alloc(); + raise(_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) { + return sz_levenshtein_distance_utf8(a.data(), a.size(), b.data(), b.size(), bound, &alloc, &result); + })); return result; } @@ -3911,11 +3947,9 @@ std::ptrdiff_t alignment_score( "sz_error_cost_t must be signed."); std::ptrdiff_t result; - if (!_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) { - return sz_needleman_wunsch_score(a.data(), a.size(), b.data(), b.size(), &subs[0][0], gap, &alloc, - &result) != sz_bad_alloc_k; - })) - throw std::bad_alloc(); + raise(_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) { + return sz_needleman_wunsch_score(a.data(), a.size(), b.data(), b.size(), &subs[0][0], gap, &alloc, &result); + })); return result; } From 1ce830bfda97715e126d5cefade0e9aa4c1c1ca2 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 05:37:31 +0000 Subject: [PATCH 159/751] Break: C++ `lookup` and `fill_random` --- include/stringzilla/stringzilla.hpp | 27 ++++++++++++--------------- scripts/test.cpp | 6 +++--- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index 87e14831..91b2480b 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -3395,9 +3395,9 @@ class basic_string { * In this case the undefined behaviour in concurrent environments may play in our favor, * but it's recommended to use the other overload in such cases. */ - basic_string &randomize() noexcept { + basic_string &fill_random() noexcept { static sz_u64_t nonce = 42; - return randomize(nonce++); + return fill_random(nonce++); } /** @@ -3407,7 +3407,7 @@ class basic_string { * @throw `std::bad_alloc` if the allocation fails. */ static basic_string random(size_type length, sz_u64_t nonce) noexcept(false) { - return basic_string(length, '\0').randomize(nonce); + return basic_string(length, '\0').fill_random(nonce); } /** @@ -3415,7 +3415,7 @@ class basic_string { * @param[in] length The length of the generated string. * @throw `std::bad_alloc` if the allocation fails. 
*/ - static basic_string random(size_type length) noexcept(false) { return basic_string(length, '\0').randomize(); } + static basic_string random(size_type length) noexcept(false) { return basic_string(length, '\0').fill_random(); } /** * @brief Replaces @b (in-place) all occurrences of a given string with the ::replacement string. @@ -3471,8 +3471,8 @@ class basic_string { * @brief Replaces @b (in-place) all characters in the string using the provided lookup @p table. * @sa sz_lookup */ - basic_string &transform(look_up_table const &table) noexcept { - transform(table, data()); + basic_string &lookup(look_up_table const &table) noexcept { + lookup(table, data()); return *this; } @@ -3481,7 +3481,7 @@ class basic_string { * @param[in] output The buffer to write the transformed string into. * @sa sz_lookup */ - void transform(look_up_table const &table, pointer output) const noexcept { + void lookup(look_up_table const &table, pointer output) const noexcept { sz_ptr_t start; sz_size_t length; sz_string_range(&string_, &start, &length); @@ -3971,7 +3971,7 @@ std::ptrdiff_t alignment_score( * @sa sz_fill_random */ template -void randomize(basic_string_slice string, sz_u64_t nonce) noexcept { +void fill_random(basic_string_slice string, sz_u64_t nonce) noexcept { static_assert(!std::is_const::value, "The string must be mutable."); sz_fill_random(string.data(), string.size(), nonce); } @@ -3982,9 +3982,8 @@ void randomize(basic_string_slice string, sz_u64_t nonce) noexcept { * @sa sz_fill_random */ template -void lookup(basic_string_slice string, basic_look_up_table const &table) noexcept { - static_assert(sizeof(char_type_) == 1, "The character type must be 1 byte long."); - sz_lookup((sz_ptr_t)string.data(), (sz_size_t)string.size(), (sz_cptr_t)string.data(), (sz_cptr_t)table.raw()); +void fill_random(basic_string_slice string) noexcept { + fill_random(string, std::rand()); } /** @@ -4004,12 +4003,10 @@ void lookup( // * @sa sz_lookup */ template -void randomize(basic_string_slice string, string_view alphabet = "abcdefghijklmnopqrstuvwxyz") noexcept { - randomize(string, std::rand, alphabet); +void lookup(basic_string_slice string, basic_look_up_table const &table) noexcept { + lookup(string, table, string.data()); } -using sorted_idx_t = sz_sorted_idx_t; - /** * @brief Internal data-structure used to wrap arbitrary sequential containers with a random-order lookup. * @sa try_argsort, argsort, try_join, join diff --git a/scripts/test.cpp b/scripts/test.cpp index 63452df4..aefcb638 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -1079,9 +1079,9 @@ void test_non_stl_extensions_for_updates() { sz::look_up_table invert_case = sz::look_up_table::identity(); for (char c = 'a'; c <= 'z'; c++) invert_case[c] = c - 'a' + 'A'; for (char c = 'A'; c <= 'Z'; c++) invert_case[c] = c - 'A' + 'a'; - assert_scoped(str s = "hello", s.transform(invert_case), s == "HELLO"); - assert_scoped(str s = "HeLLo", s.transform(invert_case), s == "hEllO"); - assert_scoped(str s = "H-lL0", s.transform(invert_case), s == "h-Ll0"); + assert_scoped(str s = "hello", s.lookup(invert_case), s == "HELLO"); + assert_scoped(str s = "HeLLo", s.lookup(invert_case), s == "hEllO"); + assert_scoped(str s = "H-lL0", s.lookup(invert_case), s == "h-Ll0"); // Concatenation. 
assert(str(str("a") | str("b")) == "ab"); From 5ea06981d27510b86d70cbdca37378fdd0b10c6c Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 05:38:39 +0000 Subject: [PATCH 160/751] Add: C++ `argsort`, `intersect` --- include/stringzilla/stringzilla.hpp | 190 +++++++++++++++++++--------- 1 file changed, 133 insertions(+), 57 deletions(-) diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index 91b2480b..a8c077d6 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -4011,27 +4011,25 @@ void lookup(basic_string_slice string, basic_look_up_table +template struct _sequence_args { - objects_type_ const *begin; - std::size_t count; - sorted_idx_t *order; - string_extractor_ extractor; + container_type_ const &container; + string_extractor_ const &extractor; }; -template -sz_cptr_t _call_sequence_member_start(void const *sequence, sz_size_t i) { - using handle_type = _sequence_args; - handle_type const *args = reinterpret_cast(sequence); - string_view member = args->extractor(args->begin[i]); +template +sz_cptr_t _call_sequence_member_start(void const *sequence_args_ptr, sz_size_t i) { + using sequence_args_t = _sequence_args; + sequence_args_t const *args = reinterpret_cast(sequence_args_ptr); + string_view member = args->extractor(args->container[i]); return member.data(); } -template -sz_size_t _call_sequence_member_length(void const *sequence, sz_size_t i) { - using handle_type = _sequence_args; - handle_type const *args = reinterpret_cast(sequence); - string_view member = args->extractor(args->begin[i]); +template +sz_size_t _call_sequence_member_length(void const *sequence_args_ptr, sz_size_t i) { + using sequence_args_t = _sequence_args; + sequence_args_t const *args = reinterpret_cast(sequence_args_ptr); + string_view member = args->extractor(args->container[i]); return static_cast(member.size()); } @@ -4039,40 +4037,79 @@ sz_size_t _call_sequence_member_length(void const *sequence, sz_size_t i) { * @brief Computes the permutation of an array, that would lead to sorted order. * The elements of the array must be convertible to a `string_view` with the given extractor. * Unlike the `sz_sequence_argsort` C interface, overwrites the output array. + * @sa sz_sequence_argsort * - * @param[in] begin The pointer to the first element of the array. - * @param[in] end The pointer to the element after the last element of the array. - * @param[out] order The pointer to the output array of indices, that will be populated with the permutation. - * @param[in] extractor The function object that extracts the string from the object. - * - * @see sz_sequence_argsort + * @param[in] begin The pointer to the first element of the array. + * @param[in] end The pointer to the element after the last element of the array. + * @param[in] extractor The function object that extracts the string from the object. + * @param[out] order The pointer to the output array of indices, that will be populated with the permutation. */ -template -void argsort(objects_type_ const *begin, objects_type_ const *end, sorted_idx_t *order, - string_extractor_ &&extractor) noexcept { +template +status_t try_argsort(container_type_ const &container, string_extractor_ const &extractor, + sorted_idx_t *order) noexcept { // Pack the arguments into a single structure to reference it from the callback. 
- _sequence_args args = {begin, static_cast(end - begin), order, - std::forward(extractor)}; - // Populate the array with `iota`-style order. - for (std::size_t i = 0; i != args.count; ++i) order[i] = static_cast(i); + using args_t = _sequence_args; + args_t args {container, extractor}; + sz_sequence_t sequence; + sequence.handle = &args; + sequence.count = container.size(); + sequence.get_start = _call_sequence_member_start; + sequence.get_length = _call_sequence_member_length; - sz_sequence_t array; - array.count = args.count; - array.handle = &args; - array.get_start = _call_sequence_member_start; - array.get_length = _call_sequence_member_length; + using sz_alloc_type = sz_memory_allocator_t; + return _with_alloc>( + [&](sz_alloc_type &alloc) { return sz_sequence_argsort(&sequence, &alloc, order); }); +} + +/** + * @brief Locates the positions of the elements in 2 deduplicated string arrays that have identical values. + * @sa sz_sequence_join + * + * @param[in] first_begin The pointer to the first element of the first array. + * @param[in] first_end The pointer to the element after the last element of the first array. + * @param[in] second_begin The pointer to the first element of the second array. + * @param[in] second_end The pointer to the element after the last element of the second array. + * @param[out] first_positions The pointer to the output array of indices from the first array. + * @param[out] second_positions The pointer to the output array of indices from the second array. + * @param[in] first_extractor The function object that extracts the string from the object in the first array. + * @param[in] second_extractor The function object that extracts the string from the object in the second array. + */ +template +status_t try_intersect( // + first_container_ const &first_container, first_extractor_ const &first_extractor, // + second_container_ const &second_container, second_extractor_ const &second_extractor, // + std::uint64_t seed, std::size_t *intersection_size_ptr, // + sorted_idx_t *first_positions, sorted_idx_t *second_positions) noexcept { + + // Pack the arguments into a single structure to reference it from the callback. + using first_t = _sequence_args; + using second_t = _sequence_args; + first_t first_args {first_container, first_extractor}; + second_t second_args {second_container, second_extractor}; + + sz_sequence_t first_sequence, second_sequence; + first_sequence.count = first_container.size(), second_sequence.count = second_container.size(); + first_sequence.handle = &first_args, second_sequence.handle = &second_args; + first_sequence.get_start = _call_sequence_member_start; + first_sequence.get_length = _call_sequence_member_length; + second_sequence.get_start = _call_sequence_member_start; + second_sequence.get_length = _call_sequence_member_length; using sz_alloc_type = sz_memory_allocator_t; - _with_alloc>( - [&](sz_alloc_type &alloc) { return sz_sequence_argsort(&array, &alloc, order); }); + return _with_alloc>([&](sz_alloc_type &alloc) { + static_assert(sizeof(sz_size_t) == sizeof(std::size_t), "sz_size_t must be the same size as std::size_t."); + return sz_sequence_intersect(&first_sequence, &second_sequence, &alloc, static_cast(seed), + reinterpret_cast(intersection_size_ptr), first_positions, + second_positions); + }); } #if !SZ_AVOID_STL #if _SZ_DEPRECATED_FINGERPRINTS /** - * @brief Computes the Rabin-Karp-like rolling binary fingerprint of a string. 
- * @see sz_hashes + * @brief Computes the Rabin-Karp-like rolling binary fingerprint of a string. + * @sa sz_hashes */ template void hashes_fingerprint( // @@ -4105,41 +4142,80 @@ std::bitset hashes_fingerprint(basic_string const &str #endif /** - * @brief Computes the permutation of an array, that would lead to sorted order. + * @brief Computes the permutation of an array, that would lead to sorted order. * @return The array of indices, that will be populated with the permutation. - * @throw `std::bad_alloc` if the allocation fails. + * @throw `std::bad_alloc` if the allocation fails. */ -template +template std::vector argsort( // - objects_type_ const *begin, objects_type_ const *end, string_extractor_ &&extractor) noexcept(false) { - std::vector order(end - begin); - argsort(begin, end, order.data(), std::forward(extractor)); + container_type_ const &container, string_extractor_ const &extractor) noexcept(false) { + std::vector order(container.size()); + status_t status = try_argsort(container, extractor, order.data()); + raise(status); return order; } /** - * @brief Computes the permutation of an array, that would lead to sorted order. + * @brief Computes the permutation of an array, that would lead to sorted order. * @return The array of indices, that will be populated with the permutation. - * @throw `std::bad_alloc` if the allocation fails. + * @throw `std::bad_alloc` if the allocation fails. */ -template -std::vector argsort(string_like_type_ const *begin, string_like_type_ const *end) noexcept(false) { +template +std::vector argsort(container_type_ const &container) noexcept(false) { + using string_like_type = typename container_type_::value_type; static_assert( // - std::is_convertible::value, "The type must be convertible to string_view."); - return argsort(begin, end, [](string_like_type_ const &s) -> string_view { return s; }); + std::is_convertible::value, "The type must be convertible to string_view."); + return argsort(container, [](string_like_type const &s) -> string_view { return s; }); } +struct intersect_result_t { + std::vector first_offsets; + std::vector second_offsets; +}; + /** - * @brief Computes the permutation of an array, that would lead to sorted order. - * @return The array of indices, that will be populated with the permutation. - * @throw `std::bad_alloc` if the allocation fails. + * @brief Locates identical elements in two arrays. + * @return Two arrays of indicies, mapping the elements of the first and the second array that have identical values. + * @throw `std::bad_alloc` if the allocation fails. */ -template -std::vector argsort(std::vector const &array) noexcept(false) { +template +intersect_result_t intersect(first_type_ const &first, second_type_ const &second, + first_extractor_ const &first_extractor, second_extractor_ const &second_extractor, + std::uint64_t seed = 0) noexcept(false) { + + std::size_t const max_count = (std::min)(first.size(), second.size()); + std::vector first_positions(max_count); + std::vector second_positions(max_count); + std::size_t count; + status_t status = try_intersect( // + first, first_extractor, // + second, second_extractor, // + seed, &count, first_positions.data(), second_positions.data()); + raise(status); + first_positions.resize(count); + second_positions.resize(count); + return {std::move(first_positions), std::move(second_positions)}; +} + +/** + * @brief Locates identical elements in two arrays. 
+ * @return Two arrays of indicies, mapping the elements of the first and the second array that have identical values. + * @throw `std::bad_alloc` if the allocation fails. + */ +template +intersect_result_t intersect(first_type_ const &first, second_type_ const &second, + std::uint64_t seed = 0) noexcept(false) { + using first_string_type = typename first_type_::value_type; + using second_string_type = typename second_type_::value_type; + static_assert( // + std::is_convertible::value, "The type must be convertible to string_view."); static_assert( // - std::is_convertible::value, "The type must be convertible to string_view."); - return argsort(array.data(), array.data() + array.size(), - [](string_like_type_ const &s) -> string_view { return s; }); + std::is_convertible::value, "The type must be convertible to string_view."); + return intersect( + first, second, // + [](first_string_type const &s) -> string_view { return s; }, // + [](second_string_type const &s) -> string_view { return s; }, // + seed); } #endif From f656577f60ec12ee4eb1b17888b2fc22b71d6e88 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 05:39:45 +0000 Subject: [PATCH 161/751] Improve: Fix minor inconsistencies --- .clang-format | 1 + .vscode/settings.json | 34 +++++++++++++++- README.md | 6 +-- include/stringzilla/hash.h | 63 ++++++++++++++++++++++++++++- include/stringzilla/memory.h | 63 ++--------------------------- include/stringzilla/similarity.h | 8 ++-- include/stringzilla/stringzilla.hpp | 51 ++++++++++++----------- rust/lib.rs | 38 ++++++++++++----- 8 files changed, 159 insertions(+), 105 deletions(-) diff --git a/.clang-format b/.clang-format index c97feb6f..1ce7d064 100644 --- a/.clang-format +++ b/.clang-format @@ -6,6 +6,7 @@ NamespaceIndentation: None ColumnLimit: 120 ReflowComments: true UseTab: Never +IndentPPDirectives: None AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false diff --git a/.vscode/settings.json b/.vscode/settings.json index a4925981..678b1305 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,6 @@ { "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools", + "C_Cpp.dimInactiveRegions": false, // This may cause overheating. 
// https://github.com/microsoft/vscode-cpptools/issues/1816 "C_Cpp.workspaceParsingPriority": "low", @@ -17,6 +18,7 @@ }, "cmake.sourceDirectory": "${workspaceRoot}", "cSpell.words": [ + "aesdec", "allowoverlap", "aminoacid", "aminoacids", @@ -24,6 +26,7 @@ "Appleby", "ASAN", "ashvardanian", + "Aumasson", "Baeza", "basicsize", "bigram", @@ -32,10 +35,16 @@ "bioinformatics", "Bitap", "bitcast", + "bitceil", "BLOSUM", + "Borwein", + "Brase", "Brumme", + "Byteset", + "bytesum", "carray", "Cawley", + "chardet", "cheminformatics", "cibuildwheel", "CONCAT", @@ -43,6 +52,7 @@ "copydoc", "Corasick", "cptr", + "DRBG", "endregion", "endswith", "Eron", @@ -51,7 +61,9 @@ "getitem", "getslice", "Giancarlo", + "Giordano", "Gonnet", + "Gotoh", "Haswell", "Heikki", "hexdigits", @@ -65,6 +77,7 @@ "isprintable", "itemsize", "Jaccard", + "Kaitchuck", "Karp", "keeplinebreaks", "keepseparator", @@ -82,13 +95,18 @@ "memcpy", "Merkle-Damgård", "Mersenne", + "misalign", "MODINIT", + "Morten", + "Mosè", "MSVC", "napi", "nargsf", "ndim", "Needleman", "newfunc", + "ngram", + "ngrams", "NOARGS", "noexcept", "NOMINMAX", @@ -97,6 +115,9 @@ "numpy", "octdigits", "octogram", + "pgram", + "pgrams", + "Plouffe", "printables", "pytest", "Pythonic", @@ -104,7 +125,9 @@ "quadgram", "Raita", "readlines", + "Reini", "releasebuffer", + "repr", "rfind", "rfinds", "richcompare", @@ -116,6 +139,7 @@ "rsplits", "rstrip", "SIMD", + "sklearn", "Skylake", "splitlines", "ssize", @@ -138,10 +162,14 @@ "Vardanian", "VBMI", "vectorcallfunc", + "Vectorizer", "Wagner", "whitespaces", "Wunsch", "XDECREF", + "xmms", + "Yann", + "Yaroshevskiy", "Zilla" ], "editor.formatOnSave": true, @@ -149,7 +177,6 @@ 120 ], "files.associations": { - "*.tcc": "cpp", "__bit_reference": "cpp", "__bits": "cpp", "__config": "cpp", @@ -168,12 +195,14 @@ "__tree": "cpp", "__tuple": "cpp", "__verbose_abort": "cpp", + "*.tcc": "cpp", "algorithm": "cpp", "any": "cpp", "array": "cpp", "atomic": "cpp", "bit": "cpp", "bitset": "cpp", + "cassert": "cpp", "cctype": "cpp", "charconv": "c", "chrono": "cpp", @@ -231,6 +260,7 @@ "semaphore": "cpp", "set": "cpp", "shared_mutex": "cpp", + "sort.h": "c", "source_location": "cpp", "span": "cpp", "sstream": "cpp", @@ -269,6 +299,6 @@ "xstring": "cpp", "xtr1common": "cpp", "xtree": "cpp", - "xutility": "cpp", + "xutility": "cpp" } } \ No newline at end of file diff --git a/README.md b/README.md index 18aea8e2..a3121cb4 100644 --- a/README.md +++ b/README.md @@ -1072,11 +1072,11 @@ Similar to Python it also defines the commonly used character sets. 
auto protein = sz::string::random(300, "ARNDCQEGHILKMFPSTWYV"); // static method auto dna = sz::basic_string::random(3_000_000_000, "ACGT"); -dna.randomize("ACGT"); // `noexcept` pre-allocated version -dna.randomize(&std::rand, "ACGT"); // pass any generator, like `std::mt19937` +dna.fill_random("ACGT"); // `noexcept` pre-allocated version +dna.fill_random(&std::rand, "ACGT"); // pass any generator, like `std::mt19937` char uuid[36]; -sz::randomize(sz::string_span(uuid, 36), "0123456789abcdef-"); // Overwrite any buffer +sz::fill_random(sz::string_span(uuid, 36), "0123456789abcdef-"); // Overwrite any buffer ``` ### Bulk Replacements diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index e23b700a..1db8a4b3 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -843,7 +843,7 @@ SZ_INTERNAL void _sz_hash_minimal_init_haswell(_sz_hash_minimal_t *state, sz_u64 __m128i k1 = _mm_xor_si128(seed_vec, pi0); __m128i k2 = _mm_xor_si128(seed_vec, pi1); - // The first 128 bits of the "sum" and "AES" blocks are the same + // The first 128 bits of the "sum" and "AES" blocks are the same for the "minimal" and full state state->aes.xmm = k1; state->sum.xmm = k2; } @@ -1559,6 +1559,8 @@ SZ_INTERNAL void _sz_hash_state_update_ice(sz_hash_state_t *state) { SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed) { + // For short strings the "masked loads" are identical to Skylake-X and + // the "logic" is identical to Haswell. if (length <= 16) { // Initialize the AES block with a given seed _sz_hash_minimal_t state; @@ -1611,6 +1613,7 @@ SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed) _sz_hash_minimal_update_haswell(&state, data3_vec.xmm); return _sz_hash_minimal_finalize_haswell(&state, length); } + // This is where the logic differs from Skylake-X and other pre-Ice Lake CPUs: else { // Use a larger state to handle the main loop and add different offsets // to different lanes of the register @@ -1716,6 +1719,64 @@ SZ_PUBLIC void sz_fill_random_ice(sz_ptr_t output, sz_size_t length, sz_u64_t no } } +/** + * @brief A wider parallel analog of `_sz_hash_minimal_t`, which is not used for computing individual hashes, + * but for parallel hashing of @b short 4x separate strings under 16 bytes long. + * Useful for higher-level Database and Machine Learning operations. + */ +typedef struct _sz_hash_minimal_x4_t { + sz_u512_vec_t aes; + sz_u512_vec_t sum; + sz_u512_vec_t key; +} _sz_hash_minimal_x4_t; + +SZ_INTERNAL void _sz_hash_minimal_x4_init_ice(_sz_hash_minimal_x4_t *state, sz_u64_t seed) { + + // The key is made from the seed and half of it will be mixed with the length in the end + __m512i seed_vec = _mm512_set1_epi64(seed); + state->key.zmm = seed_vec; + + // XOR the user-supplied keys with the two "pi" constants + sz_u64_t const *pi = _sz_hash_pi_constants(); + __m512i pi0 = _mm512_load_si512((__m512i const *)(pi)); + __m512i pi1 = _mm512_load_si512((__m512i const *)(pi + 8)); + // We will load the entire 512-bit values, but will only use the first 128 bits, + // replicating it 4x times across the register. The `_mm512_shuffle_i64x2` is supposed to + // be faster than `_mm512_broadcast_i64x2` on Ice Lake. 
+ pi0 = _mm512_shuffle_i64x2(pi0, pi0, 0); + pi1 = _mm512_shuffle_i64x2(pi1, pi1, 0); + __m512i k1 = _mm512_xor_si512(seed_vec, pi0); + __m512i k2 = _mm512_xor_si512(seed_vec, pi1); + + // The first 128 bits of the "sum" and "AES" blocks are the same for the "minimal" and full state + state->aes.zmm = k1; + state->sum.zmm = k2; +} + +SZ_INTERNAL __m256i _sz_hash_minimal_x4_finalize_ice(_sz_hash_minimal_x4_t const *state, // + sz_size_t length0, sz_size_t length1, sz_size_t length2, + sz_size_t length3) { + __m512i const padded_lengths = _mm512_set_epi64(0, length3, 0, length2, 0, length1, 0, length0); + // Mix the length into the key + __m512i key_with_length = _mm512_add_epi64(state->key.zmm, padded_lengths); + // Combine the "sum" and the "AES" blocks + __m512i mixed_registers = _mm512_aesenc_epi128(state->sum.zmm, state->aes.zmm); + // Make sure the "key" mixes enough with the state, + // as with less than 2 rounds - SMHasher fails + __m512i mixed_within_register = + _mm512_aesenc_epi128(_mm512_aesenc_epi128(mixed_registers, key_with_length), mixed_registers); + // Extract the low 64 bits from each 128-bit lane - weirdly using the `permutexvar` instruction + // is cheaper than compressing instructions like `_mm512_maskz_compress_epi64`. + return _mm512_castsi512_si256( + _mm512_permutexvar_epi64(_mm512_set_epi64(0, 0, 0, 0, 6, 4, 2, 0), mixed_within_register)); +} + +SZ_INTERNAL void _sz_hash_minimal_x4_update_ice(_sz_hash_minimal_x4_t *state, __m512i blocks) { + __m512i const shuffle_mask = _mm512_load_si512((__m512i const *)_sz_hash_u8x16x4_shuffle()); + state->aes.zmm = _mm512_aesenc_epi128(state->aes.zmm, blocks); + state->sum.zmm = _mm512_add_epi64(_mm512_shuffle_epi8(state->sum.zmm, shuffle_mask), blocks); +} + #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_ICE diff --git a/include/stringzilla/memory.h b/include/stringzilla/memory.h index 79cd840c..5b14108c 100644 --- a/include/stringzilla/memory.h +++ b/include/stringzilla/memory.h @@ -5,12 +5,11 @@ * * Includes core APIs for contiguous memory operations: * - * - @b `sz_copy` - analog to `memcpy`, probably the most common operation in a computer - * - @b `sz_move` - analog to `memmove`, allowing overlapping memory regions, often used in string manipulation - * - @b `sz_fill` - analog to `memset`, often used to initialize memory with a constant value, like zero + * - @b `sz_copy` - analog to @b `memcpy`, probably the most common operation in a computer + * - @b `sz_move` - analog to @b `memmove`, allowing overlapping memory regions, often used in string manipulation + * - @b `sz_fill` - analog to @b `memset`, often used to initialize memory with a constant value, like zero * - @b `sz_lookup` - Look-Up Table @b (LUT) transformation of a string, mapping each byte to a new value * - TODO: @b `sz_lookup_utf8` - LUT transformation of a UTF8 string, which can be used for normalization - * - TODO: @b `sz_detect_encoding` - detects the character encoding similar to "iconv" or "chardet" tools * * All of the core APIs receive the target output buffer as the first argument, * and aim to minimize the number of "store" instructions, especially unaligned ones, @@ -1084,62 +1083,6 @@ SZ_PUBLIC void sz_lookup_ice(sz_ptr_t target, sz_size_t length, sz_cptr_t source } } -enum sz_encoding_t { - sz_encoding_unknown_k = 0, - sz_encoding_ascii_k = 1, - sz_encoding_utf8_k = 2, - sz_encoding_utf16_k = 3, - sz_encoding_utf32_k = 4, - sz_encoding_jwt_k = 5, - sz_encoding_base64_k = 6, - // Low priority encodings: - 
sz_encoding_utf8bom_k = 7, - sz_encoding_utf16le_k = 8, - sz_encoding_utf16be_k = 9, - sz_encoding_utf32le_k = 10, - sz_encoding_utf32be_k = 11, -}; - -// Character Set Detection is one of the most commonly performed operations in data processing with -// [Chardet](https://github.com/chardet/chardet), [Charset Normalizer](https://github.com/jawah/charset_normalizer), -// [cChardet](https://github.com/PyYoshi/cChardet) being the most commonly used options in the Python ecosystem. -// All of them are notoriously slow. -// -// Moreover, as of October 2024, UTF-8 is the dominant character encoding on the web, used by 98.4% of websites. -// Other have minimal usage, according to [W3Techs](https://w3techs.com/technologies/overview/character_encoding): -// - ISO-8859-1: 1.2% -// - Windows-1252: 0.3% -// - Windows-1251: 0.2% -// - EUC-JP: 0.1% -// - Shift JIS: 0.1% -// - EUC-KR: 0.1% -// - GB2312: 0.1% -// - Windows-1250: 0.1% -// Within programming language implementations and database management systems, 16-bit and 32-bit fixed-width encodings -// are also very popular and we need a way to efficienly differentiate between the most common UTF flavors, ASCII, and -// the rest. -// -// One good solution is the [simdutf](https://github.com/simdutf/simdutf) library, but it depends on the C++ runtime -// and focuses more on incremental validation & transcoding, rather than detection. -// -// So we need a very fast and efficient way of determining -SZ_PUBLIC sz_bool_t sz_detect_encoding(sz_cptr_t text, sz_size_t length) { - // https://github.com/simdutf/simdutf/blob/master/src/icelake/icelake_utf8_validation.inl.cpp - // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_from_utf8.inl.cpp#L81 - // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L661 - // https://github.com/simdutf/simdutf/blob/603070affe68101e9e08ea2de19ea5f3f154cf5d/src/icelake/icelake_utf8_common.inl.cpp#L788 - - // We can implement this operation simpler & differently, assuming most of the time continuous chunks of memory - // have identical encoding. With Russian and many European languages, we generally deal with 2-byte codepoints - // with occasional 1-byte punctuation marks. In the case of Chinese, Japanese, and Korean, we deal with 3-byte - // codepoints. In the case of emojis, we deal with 4-byte codepoints. - // We can also use the idea, that misaligned reads are quite cheap on modern CPUs. - int can_be_ascii = 1, can_be_utf8 = 1, can_be_utf16 = 1, can_be_utf32 = 1; - sz_unused(can_be_ascii + can_be_utf8 + can_be_utf16 + can_be_utf32); - sz_unused(text && length); - return sz_false_k; -} - #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_ICE diff --git a/include/stringzilla/similarity.h b/include/stringzilla/similarity.h index 058b1313..6d65fcbe 100644 --- a/include/stringzilla/similarity.h +++ b/include/stringzilla/similarity.h @@ -309,7 +309,7 @@ SZ_INTERNAL sz_status_t _sz_levenshtein_distance_skewed_diagonals_serial( // } // TODO: Generalize to remove the following asserts! 
- _sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix."); + _sz_assert(bound >= longer_length && "For bounded search the method should only evaluate one band of the matrix."); _sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet."); sz_unused(longer_length && bound); @@ -860,7 +860,7 @@ SZ_INTERNAL sz_size_t _sz_levenshtein_distance_skewed_diagonals_upto63_ice( // // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return longer_length + 1; + if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return bound; } // Now let's handle the anti-diagonal band of the matrix, between the top and bottom triangles. @@ -891,7 +891,7 @@ SZ_INTERNAL sz_size_t _sz_levenshtein_distance_skewed_diagonals_upto63_ice( // // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return longer_length + 1; + if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return bound; } // Now let's handle the bottom right triangle. @@ -915,7 +915,7 @@ SZ_INTERNAL sz_size_t _sz_levenshtein_distance_skewed_diagonals_upto63_ice( // // Check if we can exit early - if none of the diagonals values are smaller than the upper distance bound. __mmask64 within_bound_mask = _mm512_cmple_epu8_mask(next_vec.zmm, bound_vec.zmm); - if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return longer_length + 1; + if (_ktestz_mask64_u8(within_bound_mask, next_diagonal_mask) == 1) return bound; // In every following iterations we take use a shorter prefix of each register, // but we don't need to update the `next_diagonal_mask` anymore... except for the early exit. diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index a8c077d6..6b0d598a 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -652,9 +652,10 @@ class range_rmatches { iterator(string_type haystack, matcher_type matcher) noexcept : matcher_(matcher), remaining_(haystack) { auto position = matcher_(remaining_); - remaining_.remove_suffix(position != string_type::npos - ? remaining_.size() - position - matcher_.needle_length() - : remaining_.size()); + remaining_.remove_suffix( // + position != string_type::npos // + ? remaining_.size() - position - matcher_.needle_length() + : remaining_.size()); } pointer operator->() const noexcept = delete; @@ -665,9 +666,10 @@ class range_rmatches { iterator &operator++() noexcept { remaining_.remove_suffix(matcher_.skip_length()); auto position = matcher_(remaining_); - remaining_.remove_suffix(position != string_type::npos - ? remaining_.size() - position - matcher_.needle_length() - : remaining_.size()); + remaining_.remove_suffix( // + position != string_type::npos // + ? 
remaining_.size() - position - matcher_.needle_length() + : remaining_.size()); return *this; } @@ -1100,12 +1102,10 @@ static void _call_free(void *ptr, sz_size_t n, void *allocator_state) noexcept { return reinterpret_cast(allocator_state)->deallocate(reinterpret_cast(ptr), n); } -template -static sz_u64_t _call_random_generator(void *state) noexcept { - generator_type_ &generator = *reinterpret_cast(state); - return generator(); -} - +/** + * @brief Helper function, wrapping a C++ allocator into a C-style allocator. + * @return Error code or success. All allocating functions may fail. + */ template static status_t _with_alloc(allocator_type_ &allocator, allocator_callback_ &&callback) noexcept { sz_memory_allocator_t alloc; @@ -2018,7 +2018,7 @@ class basic_string_slice { #pragma endregion /** - * @brief Memory-owning string class with a Small String Optimization. + * @brief Memory-owning string class with a Small String Optimization. * * @section API * @@ -2873,7 +2873,7 @@ class basic_string { } /** - * @brief Replaces @b (in-place) a range of characters with a given string. + * @brief Replaces @b (in-place) a range of characters with a given string. * @return `true` if the replacement was successful, `false` otherwise. */ bool try_replace(difference_type signed_start_offset, difference_type signed_end_offset, @@ -2929,9 +2929,9 @@ class basic_string { basic_string &insert(size_type offset, size_type repeats, char_type character) noexcept(false) { if (offset > size()) throw std::out_of_range("sz::basic_string::insert"); if (size() + repeats > max_size()) throw std::length_error("sz::basic_string::insert"); - if (!_with_alloc([&](sz_alloc_type &alloc) { return sz_string_expand(&string_, offset, repeats, &alloc); })) - throw std::bad_alloc(); - + raise(_with_alloc([&](sz_alloc_type &alloc) { + return sz_string_expand(&string_, offset, repeats, &alloc) ? sz_success_k : sz_bad_alloc_k; + })); sz_fill(data() + offset, repeats, character); return *this; } @@ -2974,10 +2974,10 @@ class basic_string { } /** - * @brief Inserts @b (in-place) one ::character at the given iterator position. - * @throw `std::out_of_range` if `pos > size()` or `other_index > other.size()`. - * @throw `std::length_error` if the string is too long. - * @throw `std::bad_alloc` if the allocation fails. + * @brief Inserts @b (in-place) one ::character at the given iterator position. + * @throw `std::out_of_range` if `pos > size()` or `other_index > other.size()`. + * @throw `std::length_error` if the string is too long. + * @throw `std::bad_alloc` if the allocation fails. */ iterator insert(const_iterator it, char_type character) noexcept(false) { auto pos = range_length(cbegin(), it); @@ -3375,11 +3375,10 @@ class basic_string { size_type bytesum() const noexcept { return view().bytesum(); } /** - * @brief Overwrites the string with random binary data. - * + * @brief Overwrites the string with random binary data. * @param[in] nonce "Number used ONCE" to initialize the random number generator, @b don't repeat it! */ - basic_string &randomize(sz_u64_t nonce) noexcept { + basic_string &fill_random(sz_u64_t nonce) noexcept { sz_ptr_t start; sz_size_t length; sz_string_range(&string_, &start, &length); @@ -3755,8 +3754,8 @@ bool basic_string::try_preparing_replacement( // } /** - * @brief Helper function-like object to order string-view convertible objects with StringZilla. 
- * @see Similar to `std::less`: https://en.cppreference.com/w/cpp/utility/functional/less + * @brief Helper function-like object to order string-view convertible objects with StringZilla. + * @see Similar to `std::less`: https://en.cppreference.com/w/cpp/utility/functional/less * * Unlike the STL analog, doesn't require C++14 or including the heavy `` header. * Can be used to combine STL classes with StringZilla logic, like: `std::map`. diff --git a/rust/lib.rs b/rust/lib.rs index b3cbca4e..3d32227a 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -23,6 +23,24 @@ pub mod sz { bits: [u64; 4], } + pub type SortedIdx = usize; + + #[repr(C)] + pub struct Sequence { + pub handle: *const c_void, + pub count: usize, + pub get_start: Option *const c_void>, + pub get_length: Option usize>, + } + + /// A simple semantic version structure. + #[derive(Debug, Copy, Clone, PartialEq, Eq)] + pub struct SemVer { + pub major: i32, + pub minor: i32, + pub patch: i32, + } + impl Byteset { /// Initializes a bit‑set to an empty collection (all characters banned). #[inline] @@ -102,7 +120,17 @@ pub mod sz { fn sz_fill_random(text: *mut c_void, length: usize, seed: u64); - // fn sz_sort() -> Status; + pub fn sz_sequence_argsort(sequence: *const Sequence, alloc: *const c_void, order: *mut SortedIdx) -> Status; + + pub fn sz_sequence_intersect( + first_sequence: *const Sequence, + second_sequence: *const Sequence, + alloc: *const c_void, + seed: u64, + intersection_size: *mut usize, + first_positions: *mut SortedIdx, + second_positions: *mut SortedIdx, + ) -> Status; pub fn sz_levenshtein_distance( a: *const c_void, @@ -162,14 +190,6 @@ pub mod sz { fn sz_lookup(target: *const c_void, length: usize, source: *const c_void, lut: *const u8) -> *const c_void; } - /// A simple semantic version structure. - #[derive(Debug, Copy, Clone, PartialEq, Eq)] - pub struct SemVer { - pub major: i32, - pub minor: i32, - pub patch: i32, - } - impl SemVer { pub const fn new(major: i32, minor: i32, patch: i32) -> Self { Self { major, minor, patch } From d19e8b83754b172dc6d57b639b0d8b5feb5b230f Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 09:56:05 +0400 Subject: [PATCH 162/751] Fix: `find_1byte` signature compatibility Without this patch Clang raises "converts to incompatible function type" due to the following flag: `-Wcast-function-type-mismatch`. --- include/stringzilla/find.h | 53 ++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/include/stringzilla/find.h b/include/stringzilla/find.h index d3db653e..1cf99e3b 100644 --- a/include/stringzilla/find.h +++ b/include/stringzilla/find.h @@ -401,14 +401,25 @@ SZ_INTERNAL sz_u64_vec_t _sz_u64_each_2byte_equal(sz_u64_vec_t a, sz_u64_vec_t b return vec; } +SZ_INTERNAL sz_cptr_t _sz_find_1byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { + sz_unused(n_length); //? We keep this argument only for `sz_find_t` signature compatibility. + return sz_find_byte_serial(h, h_length, n); +} + +SZ_INTERNAL sz_cptr_t _sz_rfind_1byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { + sz_unused(n_length); //? We keep this argument only for `sz_rfind_t` signature compatibility. + return sz_rfind_byte_serial(h, h_length, n); +} + /** * @brief Find the first occurrence of a @b two-character needle in an arbitrary length haystack. * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. 
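 *        (A sketch of one such scheme, not necessarily the exact code below: the two-byte needle is
 *        broadcast across a 64-bit word, and a pair-equality primitive like `_sz_u64_each_2byte_equal`
 *        marks matching 2-byte slots in one haystack word for the even offsets and in a shifted word
 *        for the odd ones.)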
*/ -SZ_INTERNAL sz_cptr_t _sz_find_2byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { +SZ_INTERNAL sz_cptr_t _sz_find_2byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { // This is an internal method, and the haystack is guaranteed to be at least 2 bytes long. _sz_assert(h_length >= 2 && "The haystack is too short."); + sz_unused(n_length); //? We keep this argument only for `sz_find_t` signature compatibility. sz_cptr_t const h_end = h + h_length; #if !SZ_USE_MISALIGNED_LOADS @@ -459,10 +470,11 @@ SZ_INTERNAL sz_u64_vec_t _sz_u64_each_4byte_equal(sz_u64_vec_t a, sz_u64_vec_t b * @brief Find the first occurrence of a @b four-character needle in an arbitrary length haystack. * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. */ -SZ_INTERNAL sz_cptr_t _sz_find_4byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { +SZ_INTERNAL sz_cptr_t _sz_find_4byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. _sz_assert(h_length >= 4 && "The haystack is too short."); + sz_unused(n_length); //? We keep this argument only for `sz_find_t` signature compatibility. sz_cptr_t const h_end = h + h_length; #if !SZ_USE_MISALIGNED_LOADS @@ -523,10 +535,11 @@ SZ_INTERNAL sz_u64_vec_t _sz_u64_each_3byte_equal(sz_u64_vec_t a, sz_u64_vec_t b * @brief Find the first occurrence of a @b three-character needle in an arbitrary length haystack. * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time. */ -SZ_INTERNAL sz_cptr_t _sz_find_3byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { +SZ_INTERNAL sz_cptr_t _sz_find_3byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long. _sz_assert(h_length >= 3 && "The haystack is too short."); + sz_unused(n_length); //? We keep this argument only for `sz_find_t` signature compatibility. sz_cptr_t const h_end = h + h_length; #if !SZ_USE_MISALIGNED_LOADS @@ -753,24 +766,24 @@ SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, #if _SZ_IS_BIG_ENDIAN sz_find_t backends[] = { - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, + _sz_find_1byte_serial, + _sz_find_horspool_upto_256bytes_serial, + _sz_find_horspool_over_256bytes_serial, }; return backends[(n_length > 1) + (n_length > 256)](h, h_length, n, n_length); #else sz_find_t backends[] = { // For very short strings brute-force SWAR makes sense. - (sz_find_t)sz_find_byte_serial, - (sz_find_t)_sz_find_2byte_serial, - (sz_find_t)_sz_find_3byte_serial, - (sz_find_t)_sz_find_4byte_serial, + _sz_find_1byte_serial, + _sz_find_2byte_serial, + _sz_find_3byte_serial, + _sz_find_4byte_serial, // To avoid constructing the skip-table, let's use the prefixed approach. - (sz_find_t)_sz_find_over_4bytes_serial, + _sz_find_over_4bytes_serial, // For longer needles - use skip tables. 
- (sz_find_t)_sz_find_horspool_upto_256bytes_serial, - (sz_find_t)_sz_find_horspool_over_256bytes_serial, + _sz_find_horspool_upto_256bytes_serial, + _sz_find_horspool_over_256bytes_serial, }; return backends[ @@ -790,16 +803,16 @@ SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n sz_find_t backends[] = { // For very short strings brute-force SWAR makes sense. - (sz_find_t)sz_rfind_byte_serial, + _sz_rfind_1byte_serial, // TODO: implement reverse-order SWAR for 2/3/4 byte variants. - // TODO: (sz_find_t)_sz_rfind_2byte_serial, - // TODO: (sz_find_t)_sz_rfind_3byte_serial, - // TODO: (sz_find_t)_sz_rfind_4byte_serial, + // TODO: _sz_rfind_2byte_serial, + // TODO: _sz_rfind_3byte_serial, + // TODO: _sz_rfind_4byte_serial, // To avoid constructing the skip-table, let's use the prefixed approach. - // (sz_find_t)_sz_rfind_over_4bytes_serial, + // _sz_rfind_over_4bytes_serial, // For longer needles - use skip tables. - (sz_find_t)_sz_rfind_horspool_upto_256bytes_serial, - (sz_find_t)_sz_rfind_horspool_over_256bytes_serial, + _sz_rfind_horspool_upto_256bytes_serial, + _sz_rfind_horspool_over_256bytes_serial, }; return backends[ From 90540d3a29406874fa049438dc1ed8559bd80eb9 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 10:02:47 +0400 Subject: [PATCH 163/751] Fix: Unused Levenshtein tests --- scripts/test.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/test.cpp b/scripts/test.cpp index aefcb638..ebcc01fe 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -1970,6 +1970,8 @@ int main(int argc, char const **argv) { test_search_with_misaligned_repetitions(); #endif + test_levenshtein_distances(); + std::printf("All tests passed... Unbelievable!\n"); return 0; } From feb415f74bf86f75f1841f45b88d3c681ae569c1 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 10:03:19 +0400 Subject: [PATCH 164/751] Fix: Variable in C++14 `constexpr` --- include/stringzilla/stringzilla.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index 6b0d598a..ab99bc5f 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -341,8 +341,8 @@ class basic_byteset { return *this; } - constexpr basic_byteset operator|(basic_byteset other) const noexcept { - basic_byteset result = *this; + sz_constexpr_if_cpp14 basic_byteset operator|(basic_byteset other) const noexcept { + basic_byteset result = *this; //? 
Variable declaration in a `constexpr` function is a C++14 extension result.bitset_._u64s[0] |= other.bitset_._u64s[0], result.bitset_._u64s[1] |= other.bitset_._u64s[1], result.bitset_._u64s[2] |= other.bitset_._u64s[2], result.bitset_._u64s[3] |= other.bitset_._u64s[3]; return result; From a7b35bad578482b405dc0307ecd62350717c1c9e Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 10:45:22 +0400 Subject: [PATCH 165/751] Make: Don't build `stringzilla_bare` on MacOS --- CMakeLists.txt | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 15145843..1da1e36f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -369,21 +369,24 @@ if(${STRINGZILLA_BUILD_SHARED}) target_compile_definitions(stringzilla_shared PRIVATE "SZ_OVERRIDE_LIBC=1") target_include_directories(stringzilla_shared PUBLIC include) - - # Try compiling a version without linking the LibC - define_shared(stringzilla_bare) - target_compile_definitions(stringzilla_bare PRIVATE "SZ_AVOID_LIBC=1") - target_compile_definitions(stringzilla_bare PRIVATE "SZ_OVERRIDE_LIBC=1") - target_include_directories(stringzilla_bare PUBLIC include) - - # Avoid built-ins on MSVC and other compilers, as that will cause compilation errors - target_compile_options(stringzilla_bare PRIVATE - "$<$:-fno-builtin;-nostdlib>" - "$<$:/Oi-;/GS->") - target_link_options(stringzilla_bare PRIVATE "$<$:-nostdlib>") - target_link_options(stringzilla_bare PRIVATE "$<$:/NODEFAULTLIB>") - + # Try compiling a version without linking the LibC + # ! This is only for Linux and Windows, as on modern Arm-based MacOS machines + # ! we can't legally access Arm's "feature registers" without `sysctl` or `sysctlbyname`. + # So let's check if we are compiling for a Darwin-based OS. 
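+    # (`sysctlbyname` itself is provided by macOS's `libSystem`, so a `-nostdlib` build would have
+    # nothing to call - hence the exclusion below.)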
+ if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + define_shared(stringzilla_bare) + target_compile_definitions(stringzilla_bare PRIVATE "SZ_AVOID_LIBC=1") + target_compile_definitions(stringzilla_bare PRIVATE "SZ_OVERRIDE_LIBC=1") + target_include_directories(stringzilla_bare PUBLIC include) + + # Avoid built-ins on MSVC and other compilers, as that will cause compilation errors + target_compile_options(stringzilla_bare PRIVATE + "$<$:-fno-builtin;-nostdlib>" + "$<$:/Oi-;/GS->") + target_link_options(stringzilla_bare PRIVATE "$<$:-nostdlib>") + target_link_options(stringzilla_bare PRIVATE "$<$:/NODEFAULTLIB>") + endif() endif() if(STRINGZILLA_INSTALL) From f712de33b4c6fcadac27bf4b59feb7b01234f234 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 18:38:15 +0000 Subject: [PATCH 166/751] Fix: `sz_intersect` signature --- c/lib.c | 18 +++++++++--------- include/stringzilla/types.h | 11 ++++++----- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/c/lib.c b/c/lib.c index 9c6324dd..7f1219ca 100644 --- a/c/lib.c +++ b/c/lib.c @@ -194,7 +194,7 @@ typedef struct sz_implementations_t { sz_needleman_wunsch_score_t alignment_score; sz_sequence_argsort_t sequence_argsort; - sz_sequence_join_t sequence_join; + sz_sequence_intersect_t sequence_intersect; sz_pgrams_sort_t pgrams_sort; } sz_implementations_t; @@ -239,7 +239,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->alignment_score = sz_needleman_wunsch_score_serial; impl->sequence_argsort = sz_sequence_argsort_serial; - impl->sequence_join = sz_sequence_join_serial; + impl->sequence_intersect = sz_sequence_intersect_serial; impl->pgrams_sort = sz_pgrams_sort_serial; #if SZ_USE_HASWELL @@ -291,7 +291,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->bytesum = sz_bytesum_skylake; impl->sequence_argsort = sz_sequence_argsort_skylake; - impl->sequence_join = sz_sequence_join_skylake; + impl->sequence_intersect = sz_sequence_intersect_skylake; impl->pgrams_sort = sz_pgrams_sort_skylake; } #endif @@ -343,7 +343,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { #if SZ_USE_SVE if (caps & sz_cap_sve_k) { impl->sequence_argsort = sz_sequence_argsort_sve; - impl->sequence_join = sz_sequence_join_sve; + impl->sequence_intersect = sz_sequence_intersect_sve; impl->pgrams_sort = sz_pgrams_sort_sve; } #endif @@ -507,11 +507,11 @@ SZ_DYNAMIC sz_status_t sz_sequence_argsort(sz_sequence_t const *array, sz_memory return sz_dispatch_table.sequence_argsort(array, alloc, order); } -SZ_DYNAMIC sz_status_t sz_sequence_join(sz_sequence_t const *first_array, sz_sequence_t const *second_array, - sz_memory_allocator_t *alloc, sz_size_t *intersection_size, - sz_size_t *first_positions, sz_size_t *second_positions) { - return sz_dispatch_table.sequence_join(first_array, second_array, alloc, intersection_size, first_positions, - second_positions); +SZ_DYNAMIC sz_status_t sz_sequence_intersect(sz_sequence_t const *first_array, sz_sequence_t const *second_array, + sz_memory_allocator_t *alloc, sz_u64_t seed, sz_size_t *intersection_size, + sz_size_t *first_positions, sz_size_t *second_positions) { + return sz_dispatch_table.sequence_intersect(first_array, second_array, alloc, seed, intersection_size, + first_positions, second_positions); } // Provide overrides for the libc mem* functions diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index 6d693347..11366304 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -565,9 +565,10 @@ typedef 
sz_status_t (*sz_sequence_argsort_t)(struct sz_sequence_t const *, sz_me /** @brief Signature of `sz_pgrams_sort`. */ typedef sz_status_t (*sz_pgrams_sort_t)(sz_pgram_t *, sz_size_t, sz_memory_allocator_t *, sz_sorted_idx_t *); -/** @brief Signature of `sz_sequence_join`. */ -typedef sz_status_t (*sz_sequence_join_t)(struct sz_sequence_t const *, struct sz_sequence_t const *, - sz_memory_allocator_t *, sz_size_t *, sz_sorted_idx_t *, sz_sorted_idx_t *); +/** @brief Signature of `sz_sequence_intersect`. */ +typedef sz_status_t (*sz_sequence_intersect_t)(struct sz_sequence_t const *, struct sz_sequence_t const *, + sz_memory_allocator_t *, sz_u64_t, sz_size_t *, sz_sorted_idx_t *, + sz_sorted_idx_t *); #pragma endregion @@ -726,9 +727,9 @@ SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_le #pragma region String Sequences API /** @brief Signature of `sz_sequence_t::get_start` used to get the start of a member string at a given index. */ -typedef sz_cptr_t (*sz_sequence_member_start_t)(void const *, sz_size_t); +typedef sz_cptr_t (*sz_sequence_member_start_t)(void const *, sz_sorted_idx_t); /** @brief Signature of `sz_sequence_t::get_length` used to get the length of a member string at a given index. */ -typedef sz_size_t (*sz_sequence_member_length_t)(void const *, sz_size_t); +typedef sz_size_t (*sz_sequence_member_length_t)(void const *, sz_sorted_idx_t); /** * @brief Structure to represent an ordered collection of strings. From 8bb90e54210bf5d1e3e626de12ff6086c3ad358e Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 18:42:04 +0000 Subject: [PATCH 167/751] Fix: Unused `_sz_capabilities` symbols --- c/lib.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/c/lib.c b/c/lib.c index 7f1219ca..7aea9455 100644 --- a/c/lib.c +++ b/c/lib.c @@ -51,6 +51,8 @@ extern void *malloc(size_t length); #include // `DllMain` #endif +#if _SZ_IS_ARM64 + /** * @brief Function to determine the SIMD capabilities of the current 64-bit Arm machine at @b runtime. * @return A bitmask of the SIMD capabilities represented as a `sz_capability_t` enum value. @@ -65,8 +67,8 @@ SZ_INTERNAL sz_capability_t _sz_capabilities_arm(void) { size_t size = sizeof(supports_neon); if (sysctlbyname("hw.optional.neon", &supports_neon, &size, NULL, 0) != 0) supports_neon = 0; - return (sz_capability_t)( // - (sz_cap_arm_neon_k * (supports_neon)) | // + return (sz_capability_t)( // + (sz_cap_neon_k * (supports_neon)) | // (sz_cap_serial_k)); #elif defined(_SZ_IS_LINUX) @@ -107,6 +109,10 @@ SZ_INTERNAL sz_capability_t _sz_capabilities_arm(void) { #endif } +#endif // _SZ_IS_ARM64 + +#if _SZ_IS_X86_64 + SZ_INTERNAL sz_capability_t _sz_capabilities_x86(void) { #if SZ_USE_HASWELL || SZ_USE_SKYLAKE || SZ_USE_ICE @@ -152,6 +158,7 @@ SZ_INTERNAL sz_capability_t _sz_capabilities_x86(void) { return sz_cap_serial_k; #endif } +#endif // _SZ_IS_X86_64 /** * @brief Function to determine the SIMD capabilities of the current 64-bit x86 machine at @b runtime. 
From 63f0368dc869c0d977c308a9015e03211f5a4866 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 18:44:14 +0000 Subject: [PATCH 168/751] Add: Missing SVE placeholder definition Only declarations were present for SVE --- include/stringzilla/hash.h | 18 ++++++++++++++++++ include/stringzilla/intersect.h | 22 ++++++++++++++++++++++ include/stringzilla/sort.h | 23 +++++++++++++++++++++++ 3 files changed, 63 insertions(+) diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index 1db8a4b3..aedbff89 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -1833,6 +1833,24 @@ SZ_PUBLIC sz_u64_t sz_hash_state_fold_neon(sz_hash_state_t const *state) { retur #pragma GCC target("arch=armv8.2-a+sve") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) +SZ_PUBLIC sz_u64_t sz_bytesum_sve(sz_cptr_t text, sz_size_t length) { return sz_bytesum_serial(text, length); } + +SZ_PUBLIC void sz_hash_state_init_sve(sz_hash_state_t *state, sz_u64_t seed) { sz_hash_state_init_serial(state, seed); } + +SZ_PUBLIC void sz_hash_state_stream_sve(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { + sz_hash_state_stream_serial(state, text, length); +} + +SZ_PUBLIC sz_u64_t sz_hash_state_fold_sve(sz_hash_state_t const *state) { return sz_hash_state_fold_serial(state); } + +SZ_PUBLIC sz_u64_t sz_hash_sve(sz_cptr_t text, sz_size_t length, sz_u64_t seed) { + return sz_hash_serial(text, length, seed); +} + +SZ_PUBLIC void sz_fill_random_sve(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { + sz_fill_random_serial(text, length, nonce); +} + #pragma clang attribute pop #pragma GCC pop_options #endif // SZ_USE_SVE diff --git a/include/stringzilla/intersect.h b/include/stringzilla/intersect.h index 77033148..b3610969 100644 --- a/include/stringzilla/intersect.h +++ b/include/stringzilla/intersect.h @@ -713,6 +713,28 @@ SZ_PUBLIC sz_status_t sz_sequence_intersect_ice( #endif // SZ_USE_ICE #pragma endregion // Ice Lake Implementation +#pragma region SVE Implementation +#if SZ_USE_SVE +#pragma GCC push_options +#pragma GCC target("arch=armv8.2-a+sve") +#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) + +SZ_PUBLIC sz_status_t sz_sequence_intersect_sve(sz_sequence_t const *first_sequence, + sz_sequence_t const *second_sequence, // + sz_memory_allocator_t *alloc, sz_u64_t seed, + sz_size_t *intersection_size, sz_sorted_idx_t *first_positions, + sz_sorted_idx_t *second_positions) { + return sz_sequence_intersect_serial( // + first_sequence, second_sequence, // + alloc, seed, intersection_size, // + first_positions, second_positions); +} + +#pragma clang attribute pop +#pragma GCC pop_options +#endif // SZ_USE_SVE +#pragma endregion // SVE Implementation + /* Pick the right implementation for the string search algorithms. * To override this behavior and precompile all backends - set `SZ_DYNAMIC_DISPATCH` to 1. 
*/ diff --git a/include/stringzilla/sort.h b/include/stringzilla/sort.h index af808cb5..b4d487bd 100644 --- a/include/stringzilla/sort.h +++ b/include/stringzilla/sort.h @@ -925,6 +925,29 @@ SZ_PUBLIC sz_status_t sz_sequence_argsort_skylake(sz_sequence_t const *sequence, #endif // SZ_USE_SKYLAKE #pragma endregion // Ice Lake Implementation +#pragma region SVE Implementation +#if SZ_USE_SVE +#pragma GCC push_options +#pragma GCC target("arch=armv8.2-a+sve") +#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) + +/** @copydoc sz_sequence_argsort */ +SZ_PUBLIC sz_status_t sz_sequence_argsort_sve(sz_sequence_t const *sequence, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { + return sz_sequence_argsort_serial(sequence, alloc, order); +} + +/** @copydoc sz_pgrams_sort */ +SZ_PUBLIC sz_status_t sz_pgrams_sort_sve(sz_pgram_t *pgrams, sz_size_t count, sz_memory_allocator_t *alloc, + sz_sorted_idx_t *order) { + return sz_pgrams_sort_serial(pgrams, count, alloc, order); +} + +#pragma clang attribute pop +#pragma GCC pop_options +#endif // SZ_USE_SVE +#pragma endregion // SVE Implementation + /* Pick the right implementation for the string search algorithms. * To override this behavior and precompile all backends - set `SZ_DYNAMIC_DISPATCH` to 1. */ From 2965502c82a7e96c3252b745e029f713f8c2b1f9 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 18:44:58 +0000 Subject: [PATCH 169/751] Fix: Guard Skylake benchmarks --- scripts/bench_sort.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/bench_sort.cpp b/scripts/bench_sort.cpp index f32d9909..8cdf97fd 100644 --- a/scripts/bench_sort.cpp +++ b/scripts/bench_sort.cpp @@ -112,6 +112,7 @@ int main(int argc, char const **argv) { }); expect_sorted(pgrams, permute); +#if SZ_USE_SKYLAKE bench_permute("sz_pgrams_sort_skylake", [&]() { std::copy(pgrams.begin(), pgrams.end(), pgrams_sorted.begin()); std::iota(permute.begin(), permute.end(), 0); @@ -120,6 +121,7 @@ int main(int argc, char const **argv) { }); }); expect_sorted(pgrams, permute); +#endif // Sorting strings bench_permute("std::sort(positions)", [&]() { @@ -140,7 +142,7 @@ int main(int argc, char const **argv) { [&](sz_memory_allocator_t &alloc) { return sz_sequence_argsort_serial(&array, &alloc, permute.data()); }); }); expect_sorted(strings, permute); - +#if SZ_USE_SKYLAKE bench_permute("sz_sequence_argsort_skylake", [&]() { std::iota(permute.begin(), permute.end(), 0); sz_sequence_t array; @@ -152,6 +154,7 @@ int main(int argc, char const **argv) { [&](sz_memory_allocator_t &alloc) { return sz_sequence_argsort_skylake(&array, &alloc, permute.data()); }); }); expect_sorted(strings, permute); +#endif #if __linux__ && defined(_GNU_SOURCE) && !defined(__BIONIC__) bench_permute("qsort_r", [&]() { From 4b3847df38feef40e8ad4bf53eab3dc5c6f1c83d Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 19:05:26 +0000 Subject: [PATCH 170/751] Add: Arm NEON hashing --- include/stringzilla/hash.h | 379 ++++++++++++++++++++++++++++++++++-- include/stringzilla/types.h | 12 ++ 2 files changed, 374 insertions(+), 17 deletions(-) diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index aedbff89..eb748ef4 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -61,8 +61,6 @@ * @see The serial AES routines are based on Morten Jensen's "tiny-AES-c": 
https://github.com/kokke/tiny-AES-c * @see The "xxHash" C implementation by Yann Collet: https://github.com/Cyan4973/xxHash * @see The "aHash" Rust implementation by Tom Kaitchuck: https://github.com/tkaitchuck/aHash - * @see "Emulating x86 AES Intrinsics on ARMv8-A" by Michael Brase: - * https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a/ * * Moreover, the same AES primitives are reused to implement a fast Pseudo-Random Number Generator @b (PRNG) that * is consistent between different implementation backends and has reproducible output with the same "nonce". @@ -877,7 +875,7 @@ SZ_PUBLIC void sz_hash_state_init_haswell(sz_hash_state_t *state, sz_u64_t seed) for (int i = 0; i < 4; ++i) state->aes.xmms[i] = _mm_xor_si128(seed_vec, _mm_load_si128((__m128i const *)(pi + i * 2))); for (int i = 0; i < 4; ++i) - state->sum.xmms[i] = _mm_xor_si128(seed_vec, _mm_load_si128((__m128i const *)(pi + i * 2 + 8))); + state->sum.u64x2s[i] = _mm_xor_si128(seed_vec, _mm_load_si128((__m128i const *)(pi + i * 2 + 8))); // The inputs are zeroed out at the beginning state->ins.xmms[0] = state->ins.xmms[1] = state->ins.xmms[2] = state->ins.xmms[3] = _mm_setzero_si128(); @@ -887,23 +885,23 @@ SZ_PUBLIC void sz_hash_state_init_haswell(sz_hash_state_t *state, sz_u64_t seed) SZ_INTERNAL void _sz_hash_state_update_haswell(sz_hash_state_t *state) { __m128i const shuffle_mask = _mm_load_si128((__m128i const *)_sz_hash_u8x16x4_shuffle()); state->aes.xmms[0] = _mm_aesenc_si128(state->aes.xmms[0], state->ins.xmms[0]); - state->sum.xmms[0] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[0], shuffle_mask), state->ins.xmms[0]); + state->sum.u64x2s[0] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.u64x2s[0], shuffle_mask), state->ins.xmms[0]); state->aes.xmms[1] = _mm_aesenc_si128(state->aes.xmms[1], state->ins.xmms[1]); - state->sum.xmms[1] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[1], shuffle_mask), state->ins.xmms[1]); + state->sum.u64x2s[1] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.u64x2s[1], shuffle_mask), state->ins.xmms[1]); state->aes.xmms[2] = _mm_aesenc_si128(state->aes.xmms[2], state->ins.xmms[2]); - state->sum.xmms[2] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[2], shuffle_mask), state->ins.xmms[2]); + state->sum.u64x2s[2] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.u64x2s[2], shuffle_mask), state->ins.xmms[2]); state->aes.xmms[3] = _mm_aesenc_si128(state->aes.xmms[3], state->ins.xmms[3]); - state->sum.xmms[3] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[3], shuffle_mask), state->ins.xmms[3]); + state->sum.u64x2s[3] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.u64x2s[3], shuffle_mask), state->ins.xmms[3]); } SZ_INTERNAL sz_u64_t _sz_hash_state_finalize_haswell(sz_hash_state_t const *state) { // Mix the length into the key __m128i key_with_length = _mm_add_epi64(state->key.xmm, _mm_set_epi64x(0, state->ins_length)); // Combine the "sum" and the "AES" blocks - __m128i mixed_registers0 = _mm_aesenc_si128(state->sum.xmms[0], state->aes.xmms[0]); - __m128i mixed_registers1 = _mm_aesenc_si128(state->sum.xmms[1], state->aes.xmms[1]); - __m128i mixed_registers2 = _mm_aesenc_si128(state->sum.xmms[2], state->aes.xmms[2]); - __m128i mixed_registers3 = _mm_aesenc_si128(state->sum.xmms[3], state->aes.xmms[3]); + __m128i mixed_registers0 = _mm_aesenc_si128(state->sum.u64x2s[0], state->aes.xmms[0]); + __m128i mixed_registers1 = _mm_aesenc_si128(state->sum.u64x2s[1], state->aes.xmms[1]); + __m128i mixed_registers2 = _mm_aesenc_si128(state->sum.u64x2s[2], state->aes.xmms[2]); + __m128i 
mixed_registers3 = _mm_aesenc_si128(state->sum.u64x2s[3], state->aes.xmms[3]); // Combine the mixed registers __m128i mixed_registers01 = _mm_aesenc_si128(mixed_registers0, mixed_registers1); __m128i mixed_registers23 = _mm_aesenc_si128(mixed_registers2, mixed_registers3); @@ -1045,7 +1043,7 @@ SZ_PUBLIC sz_u64_t sz_hash_state_fold_haswell(sz_hash_state_t const *state) { _sz_hash_minimal_t minimal_state; minimal_state.key.xmm = state->key.xmm; minimal_state.aes.xmm = state->aes.xmms[0]; - minimal_state.sum.xmm = state->sum.xmms[0]; + minimal_state.sum.xmm = state->sum.u64x2s[0]; // The logic is different depending on the length of the input __m128i const *ins_vecs = (__m128i const *)&state->ins.xmms[0]; @@ -1788,8 +1786,8 @@ SZ_INTERNAL void _sz_hash_minimal_x4_update_ice(_sz_hash_minimal_x4_t *state, __ #pragma region NEON Implementation #if SZ_USE_NEON #pragma GCC push_options -#pragma GCC target("arch=armv8.2-a+simd") -#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd"))), apply_to = function) +#pragma GCC target("arch=armv8.2-a+simd+crypto") +#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+crypto"))), apply_to = function) SZ_PUBLIC sz_u64_t sz_bytesum_neon(sz_cptr_t text, sz_size_t length) { uint64x2_t sum_vec = vdupq_n_u64(0); @@ -1809,15 +1807,362 @@ SZ_PUBLIC sz_u64_t sz_bytesum_neon(sz_cptr_t text, sz_size_t length) { return sum; } +/** + * @brief Emulates the Intel's AES-NI `AESENC` instruction on Arm NEON. + * @see "Emulating x86 AES Intrinsics on ARMv8-A" by Michael Brase: + * https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a/ + */ +SZ_INTERNAL uint8x16_t _sz_emulate_aesenc_u8x16_neon(uint8x16_t state_vec, uint8x16_t round_key_vec) { + return veorq_u8(vaesmcq_u8(vaeseq_u8(state_vec, vdupq_n_u8(0))), round_key_vec); +} + +SZ_INTERNAL uint64x2_t _sz_emulate_aesenc_u64x2_neon(uint64x2_t state_vec, uint64x2_t round_key_vec) { + return vreinterpretq_u64_u8( // + _sz_emulate_aesenc_u8x16_neon( // + vreinterpretq_u8_u64(state_vec), // + vreinterpretq_u8_u64(round_key_vec))); +} + +SZ_INTERNAL void _sz_hash_minimal_init_neon(_sz_hash_minimal_t *state, sz_u64_t seed) { + + // The key is made from the seed and half of it will be mixed with the length in the end + uint64x2_t seed_vec = vdupq_n_u64(seed); + state->key.u64x2 = seed_vec; + + // XOR the user-supplied keys with the two "pi" constants + sz_u64_t const *pi = _sz_hash_pi_constants(); + uint64x2_t const pi0 = vld1q_u64(pi); + uint64x2_t const pi1 = vld1q_u64(pi + 8); + uint64x2_t k1 = veorq_u64(seed_vec, pi0); + uint64x2_t k2 = veorq_u64(seed_vec, pi1); + + // The first 128 bits of the "sum" and "AES" blocks are the same for the "minimal" and full state + state->aes.u64x2 = k1; + state->sum.u64x2 = k2; +} + +SZ_INTERNAL sz_u64_t _sz_hash_minimal_finalize_neon(_sz_hash_minimal_t const *state, sz_size_t length) { + // Mix the length into the key + uint64x2_t key_with_length = vaddq_u64(state->key.u64x2, vsetq_lane_u64(length, vdupq_n_u64(0), 0)); + // Combine the "sum" and the "AES" blocks + uint8x16_t mixed_registers = _sz_emulate_aesenc_u8x16_neon(state->sum.u8x16, state->aes.u8x16); + // Make sure the "key" mixes enough with the state, + // as with less than 2 rounds - SMHasher fails + uint8x16_t mixed_within_register = _sz_emulate_aesenc_u8x16_neon( + _sz_emulate_aesenc_u8x16_neon(mixed_registers, vreinterpretq_u8_u64(key_with_length)), mixed_registers); + // Extract the low 64 bits + return vgetq_lane_u64(vreinterpretq_u64_u8(mixed_within_register), 0); 
+} + +SZ_INTERNAL void _sz_hash_minimal_update_neon(_sz_hash_minimal_t *state, uint8x16_t block) { + uint8x16_t const shuffle_mask = vld1q_u8(_sz_hash_u8x16x4_shuffle()); + state->aes.u8x16 = _sz_emulate_aesenc_u8x16_neon(state->aes.u8x16, block); + uint8x16_t sum_shuffled = vqtbl1q_u8(vreinterpretq_u8_u64(state->sum.u64x2), shuffle_mask); + state->sum.u64x2 = vaddq_u64(vreinterpretq_u64_u8(sum_shuffled), vreinterpretq_u64_u8(block)); +} + SZ_PUBLIC void sz_hash_state_init_neon(sz_hash_state_t *state, sz_u64_t seed) { - sz_hash_state_init_serial(state, seed); + // The key is made from the seed and half of it will be mixed with the length in the end + uint64x2_t seed_vec = vdupq_n_u64(seed); + state->key.u64x2 = seed_vec; + + // XOR the user-supplied keys with the two "pi" constants + sz_u64_t const *pi = _sz_hash_pi_constants(); + for (int i = 0; i < 4; ++i) state->aes.u64x2s[i] = veorq_u64(seed_vec, vld1q_u64(pi + i * 2)); + for (int i = 0; i < 4; ++i) state->sum.u64x2s[i] = veorq_u64(seed_vec, vld1q_u64(pi + i * 2 + 8)); + + // The inputs are zeroed out at the beginning + state->ins.u8x16s[0] = state->ins.u8x16s[1] = state->ins.u8x16s[2] = state->ins.u8x16s[3] = vdupq_n_u8(0); + state->ins_length = 0; +} + +SZ_INTERNAL void _sz_hash_state_update_neon(sz_hash_state_t *state) { + uint8x16_t const shuffle_mask = vld1q_u8(_sz_hash_u8x16x4_shuffle()); + state->aes.u8x16s[0] = _sz_emulate_aesenc_u8x16_neon(state->aes.u8x16s[0], state->ins.u8x16s[0]); + uint8x16_t sum_shuffled0 = vqtbl1q_u8(vreinterpretq_u8_u64(state->sum.u64x2s[0]), shuffle_mask); + state->sum.u64x2s[0] = vaddq_u64(vreinterpretq_u64_u8(sum_shuffled0), state->ins.u64x2s[0]); + state->aes.u8x16s[1] = _sz_emulate_aesenc_u8x16_neon(state->aes.u8x16s[1], state->ins.u8x16s[1]); + uint8x16_t sum_shuffled1 = vqtbl1q_u8(vreinterpretq_u8_u64(state->sum.u64x2s[1]), shuffle_mask); + state->sum.u64x2s[1] = vaddq_u64(vreinterpretq_u64_u8(sum_shuffled1), state->ins.u64x2s[1]); + state->aes.u8x16s[2] = _sz_emulate_aesenc_u8x16_neon(state->aes.u8x16s[2], state->ins.u8x16s[2]); + uint8x16_t sum_shuffled2 = vqtbl1q_u8(vreinterpretq_u8_u64(state->sum.u64x2s[2]), shuffle_mask); + state->sum.u64x2s[2] = vaddq_u64(vreinterpretq_u64_u8(sum_shuffled2), state->ins.u64x2s[2]); + state->aes.u8x16s[3] = _sz_emulate_aesenc_u8x16_neon(state->aes.u8x16s[3], state->ins.u8x16s[3]); + uint8x16_t sum_shuffled3 = vqtbl1q_u8(vreinterpretq_u8_u64(state->sum.u64x2s[3]), shuffle_mask); + state->sum.u64x2s[3] = vaddq_u64(vreinterpretq_u64_u8(sum_shuffled3), state->ins.u64x2s[3]); +} + +SZ_INTERNAL sz_u64_t _sz_hash_state_finalize_neon(sz_hash_state_t const *state) { + // Mix the length into the key + uint64x2_t key_with_length = vaddq_u64(state->key.u64x2, vsetq_lane_u64(state->ins_length, vdupq_n_u64(0), 0)); + // Combine the "sum" and the "AES" blocks + uint8x16_t mixed_registers0 = _sz_emulate_aesenc_u8x16_neon(state->sum.u8x16s[0], state->aes.u8x16s[0]); + uint8x16_t mixed_registers1 = _sz_emulate_aesenc_u8x16_neon(state->sum.u8x16s[1], state->aes.u8x16s[1]); + uint8x16_t mixed_registers2 = _sz_emulate_aesenc_u8x16_neon(state->sum.u8x16s[2], state->aes.u8x16s[2]); + uint8x16_t mixed_registers3 = _sz_emulate_aesenc_u8x16_neon(state->sum.u8x16s[3], state->aes.u8x16s[3]); + // Combine the mixed registers + uint8x16_t mixed_registers01 = _sz_emulate_aesenc_u8x16_neon(mixed_registers0, mixed_registers1); + uint8x16_t mixed_registers23 = _sz_emulate_aesenc_u8x16_neon(mixed_registers2, mixed_registers3); + uint8x16_t mixed_registers = 
_sz_emulate_aesenc_u8x16_neon(mixed_registers01, mixed_registers23); + // Make sure the "key" mixes enough with the state, + // as with less than 2 rounds - SMHasher fails + uint8x16_t mixed_within_register = _sz_emulate_aesenc_u8x16_neon( + _sz_emulate_aesenc_u8x16_neon(mixed_registers, vreinterpretq_u8_u64(key_with_length)), mixed_registers); + // Extract the low 64 bits + return vgetq_lane_u64(vreinterpretq_u64_u8(mixed_within_register), 0); } SZ_PUBLIC void sz_hash_state_stream_neon(sz_hash_state_t *state, sz_cptr_t text, sz_size_t length) { - sz_hash_state_stream_serial(state, text, length); + // This whole function is identical to Haswell. + while (length) { + // Append to the internal buffer until it's full + if (state->ins_length % 64 == 0 && length >= 64) { + state->ins.u8x16s[0] = vld1q_u8((sz_u8_t const *)text); + state->ins.u8x16s[1] = vld1q_u8((sz_u8_t const *)(text + 16)); + state->ins.u8x16s[2] = vld1q_u8((sz_u8_t const *)(text + 32)); + state->ins.u8x16s[3] = vld1q_u8((sz_u8_t const *)(text + 48)); + _sz_hash_state_update_neon(state); + state->ins_length += 64; + text += 64; + length -= 64; + } + // If vectorization isn't that trivial - fall back to the serial implementation + else { + sz_size_t progress_in_block = state->ins_length % 64; + sz_size_t to_copy = sz_min_of_two(length, 64 - progress_in_block); + int const will_fill_block = progress_in_block + to_copy == 64; + // Update the metadata before we modify the `to_copy` variable + state->ins_length += to_copy; + length -= to_copy; + // Append to the internal buffer until it's full + while (to_copy--) state->ins.u8s[progress_in_block++] = *text++; + // If we've reached the end of the buffer, update the state + if (will_fill_block) { + _sz_hash_state_update_neon(state); + // Reset to zeros now, so we don't have to overwrite an immutable buffer in the folding state + for (int i = 0; i < 4; ++i) state->ins.u8x16s[i] = vdupq_n_u8(0); + } + } + } } -SZ_PUBLIC sz_u64_t sz_hash_state_fold_neon(sz_hash_state_t const *state) { return sz_hash_state_fold_serial(state); } +SZ_PUBLIC sz_u64_t sz_hash_state_fold_neon(sz_hash_state_t const *state) { + // This whole function is identical to Haswell. 
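+    // Only the intrinsics differ: each x86 `AESENC` round is emulated with `vaeseq_u8` + `vaesmcq_u8`
+    // plus an XOR with the round key, see `_sz_emulate_aesenc_u8x16_neon` above.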
+ sz_size_t length = state->ins_length; + if (length >= 64) return _sz_hash_state_finalize_neon(state); + + // Switch back to a smaller "minimal" state for small inputs + _sz_hash_minimal_t minimal_state; + minimal_state.key.u8x16 = state->key.u8x16; + minimal_state.aes.u8x16 = state->aes.u8x16s[0]; + minimal_state.sum.u8x16 = state->sum.u8x16s[0]; + + // The logic is different depending on the length of the input + uint8x16_t const *ins_vecs = (uint8x16_t const *)&state->ins.u8x16s[0]; + if (length <= 16) { + _sz_hash_minimal_update_neon(&minimal_state, ins_vecs[0]); + return _sz_hash_minimal_finalize_neon(&minimal_state, length); + } + else if (length <= 32) { + _sz_hash_minimal_update_neon(&minimal_state, ins_vecs[0]); + _sz_hash_minimal_update_neon(&minimal_state, ins_vecs[1]); + return _sz_hash_minimal_finalize_neon(&minimal_state, length); + } + else if (length <= 48) { + _sz_hash_minimal_update_neon(&minimal_state, ins_vecs[0]); + _sz_hash_minimal_update_neon(&minimal_state, ins_vecs[1]); + _sz_hash_minimal_update_neon(&minimal_state, ins_vecs[2]); + return _sz_hash_minimal_finalize_neon(&minimal_state, length); + } + else { + _sz_hash_minimal_update_neon(&minimal_state, ins_vecs[0]); + _sz_hash_minimal_update_neon(&minimal_state, ins_vecs[1]); + _sz_hash_minimal_update_neon(&minimal_state, ins_vecs[2]); + _sz_hash_minimal_update_neon(&minimal_state, ins_vecs[3]); + return _sz_hash_minimal_finalize_neon(&minimal_state, length); + } +} + +SZ_PUBLIC sz_u64_t sz_hash_neon(sz_cptr_t start, sz_size_t length, sz_u64_t seed) { + if (length <= 16) { + // Initialize the AES block with a given seed + _sz_hash_minimal_t state; + _sz_hash_minimal_init_neon(&state, seed); + // Load the data and update the state + sz_u128_vec_t data_vec; + data_vec.u8x16 = vdupq_n_u8(0); + for (sz_size_t i = 0; i < length; ++i) data_vec.u8s[i] = start[i]; + _sz_hash_minimal_update_neon(&state, data_vec.u8x16); + return _sz_hash_minimal_finalize_neon(&state, length); + } + else if (length <= 32) { + // Initialize the AES block with a given seed + _sz_hash_minimal_t state; + _sz_hash_minimal_init_neon(&state, seed); + // Load the data and update the state + sz_u128_vec_t data0_vec, data1_vec; + data0_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start)); + data1_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start + length - 16)); + // Let's shift the data within the register to de-interleave the bytes. + _sz_hash_shift_in_register_serial(&data1_vec, 32 - length); + _sz_hash_minimal_update_neon(&state, data0_vec.u8x16); + _sz_hash_minimal_update_neon(&state, data1_vec.u8x16); + return _sz_hash_minimal_finalize_neon(&state, length); + } + else if (length <= 48) { + // Initialize the AES block with a given seed + _sz_hash_minimal_t state; + _sz_hash_minimal_init_neon(&state, seed); + // Load the data and update the state + sz_u128_vec_t data0_vec, data1_vec, data2_vec; + data0_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start)); + data1_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start + 16)); + data2_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start + length - 16)); + // Let's shift the data within the register to de-interleave the bytes. 
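+        // The tail load at `start + length - 16` overlaps the previous 16-byte block by exactly
+        // `48 - length` bytes, which is the shift amount passed below.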
+ _sz_hash_shift_in_register_serial(&data2_vec, 48 - length); + _sz_hash_minimal_update_neon(&state, data0_vec.u8x16); + _sz_hash_minimal_update_neon(&state, data1_vec.u8x16); + _sz_hash_minimal_update_neon(&state, data2_vec.u8x16); + return _sz_hash_minimal_finalize_neon(&state, length); + } + else if (length <= 64) { + // Initialize the AES block with a given seed + _sz_hash_minimal_t state; + _sz_hash_minimal_init_neon(&state, seed); + // Load the data and update the state + sz_u128_vec_t data0_vec, data1_vec, data2_vec, data3_vec; + data0_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start)); + data1_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start + 16)); + data2_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start + 32)); + data3_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start + length - 16)); + // Let's shift the data within the register to de-interleave the bytes. + _sz_hash_shift_in_register_serial(&data3_vec, 64 - length); + _sz_hash_minimal_update_neon(&state, data0_vec.u8x16); + _sz_hash_minimal_update_neon(&state, data1_vec.u8x16); + _sz_hash_minimal_update_neon(&state, data2_vec.u8x16); + _sz_hash_minimal_update_neon(&state, data3_vec.u8x16); + return _sz_hash_minimal_finalize_neon(&state, length); + } + else { + // Use a larger state to handle the main loop and add different offsets + // to different lanes of the register + sz_hash_state_t state; + sz_hash_state_init_neon(&state, seed); + for (; state.ins_length + 64 <= length; state.ins_length += 64) { + state.ins.u8x16s[0] = vld1q_u8((sz_u8_t const *)(start + state.ins_length)); + state.ins.u8x16s[1] = vld1q_u8((sz_u8_t const *)(start + state.ins_length + 16)); + state.ins.u8x16s[2] = vld1q_u8((sz_u8_t const *)(start + state.ins_length + 32)); + state.ins.u8x16s[3] = vld1q_u8((sz_u8_t const *)(start + state.ins_length + 48)); + _sz_hash_state_update_neon(&state); + } + // Handle the tail, resetting the registers to zero first + if (state.ins_length < length) { + state.ins.u8x16s[0] = vdupq_n_u8(0); + state.ins.u8x16s[1] = vdupq_n_u8(0); + state.ins.u8x16s[2] = vdupq_n_u8(0); + state.ins.u8x16s[3] = vdupq_n_u8(0); + for (sz_size_t i = 0; state.ins_length < length; ++i, ++state.ins_length) + state.ins.u8s[i] = start[state.ins_length]; + _sz_hash_state_update_neon(&state); + state.ins_length = length; + } + return _sz_hash_state_finalize_neon(&state); + } +} + +SZ_PUBLIC void sz_fill_random_neon(sz_ptr_t text, sz_size_t length, sz_u64_t nonce) { + sz_u64_t const *pi_ptr = _sz_hash_pi_constants(); + if (length <= 16) { + uint64x2_t input = vdupq_n_u64(nonce); + uint64x2_t pi = vld1q_u64(pi_ptr); + uint64x2_t key = veorq_u64(vdupq_n_u64(nonce), pi); + uint64x2_t generated = _sz_emulate_aesenc_u64x2_neon(input, key); + // Now the tricky part is outputting this data to the user-supplied buffer + // without masked writes, like in AVX-512. + for (sz_size_t i = 0; i < length; ++i) text[i] = ((sz_u8_t *)&generated)[i]; + } + // Assuming the YMM register contains two 128-bit blocks, the input to the generator + // will be more complex, containing the sum of the nonce and the block number. 
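+    // E.g. for a 20-byte buffer two blocks are produced from counters `nonce + 0` and `nonce + 1`:
+    // the first is stored whole, and only the leading 4 bytes of the second are copied out.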
+ else if (length <= 32) { + uint64x2_t inputs[2], pis[2], keys[2], generated[2]; + inputs[0] = vdupq_n_u64(nonce + 0); + inputs[1] = vdupq_n_u64(nonce + 1); + pis[0] = vld1q_u64(pi_ptr + 0); + pis[1] = vld1q_u64(pi_ptr + 2); + keys[0] = veorq_u64(vdupq_n_u64(nonce), pis[0]); + keys[1] = veorq_u64(vdupq_n_u64(nonce), pis[1]); + generated[0] = _sz_emulate_aesenc_u64x2_neon(inputs[0], keys[0]); + generated[1] = _sz_emulate_aesenc_u64x2_neon(inputs[1], keys[1]); + // The first store can easily be vectorized, but the second can be serial for now + vst1q_u64((sz_u64_t *)(text), generated[0]); + for (sz_size_t i = 16; i < length; ++i) text[i] = ((sz_u8_t *)&generated[1])[i - 16]; + } + // The last special case we handle outside of the primary loop is for buffers up to 64 bytes long. + else if (length <= 48) { + uint64x2_t inputs[3], pis[3], keys[3], generated[3]; + inputs[0] = vdupq_n_u64(nonce); + inputs[1] = vdupq_n_u64(nonce + 1); + inputs[2] = vdupq_n_u64(nonce + 2); + pis[0] = vld1q_u64(pi_ptr + 0); + pis[1] = vld1q_u64(pi_ptr + 2); + pis[2] = vld1q_u64(pi_ptr + 4); + keys[0] = veorq_u64(vdupq_n_u64(nonce), pis[0]); + keys[1] = veorq_u64(vdupq_n_u64(nonce), pis[1]); + keys[2] = veorq_u64(vdupq_n_u64(nonce), pis[2]); + generated[0] = _sz_emulate_aesenc_u64x2_neon(inputs[0], keys[0]); + generated[1] = _sz_emulate_aesenc_u64x2_neon(inputs[1], keys[1]); + generated[2] = _sz_emulate_aesenc_u64x2_neon(inputs[2], keys[2]); + // The first store can easily be vectorized, but the second can be serial for now + vst1q_u64((sz_u64_t *)(text + 0), generated[0]); + vst1q_u64((sz_u64_t *)(text + 16), generated[1]); + for (sz_size_t i = 32; i < length; ++i) text[i] = ((sz_u8_t *)generated)[i]; + } + // The final part of the function is the primary loop, which processes the buffer in 64-byte chunks. + else { + uint64x2_t inputs[4], pis[4], keys[4], generated[4]; + inputs[0] = vdupq_n_u64(nonce + 0); + inputs[1] = vdupq_n_u64(nonce + 1); + inputs[2] = vdupq_n_u64(nonce + 2); + inputs[3] = vdupq_n_u64(nonce + 3); + // Load parts of PI into the registers + pis[0] = vld1q_u64(pi_ptr + 0); + pis[1] = vld1q_u64(pi_ptr + 2); + pis[2] = vld1q_u64(pi_ptr + 4); + pis[3] = vld1q_u64(pi_ptr + 6); + // XOR the nonce with the PI constants + keys[0] = veorq_u64(vdupq_n_u64(nonce), pis[0]); + keys[1] = veorq_u64(vdupq_n_u64(nonce), pis[1]); + keys[2] = veorq_u64(vdupq_n_u64(nonce), pis[2]); + keys[3] = veorq_u64(vdupq_n_u64(nonce), pis[3]); + + // Produce the output, fixing the key and enumerating input chunks. + sz_size_t i = 0; + uint64x2_t const increment = vdupq_n_u64(4); + for (; i + 64 <= length; i += 64) { + generated[0] = _sz_emulate_aesenc_u64x2_neon(inputs[0], keys[0]); + generated[1] = _sz_emulate_aesenc_u64x2_neon(inputs[1], keys[1]); + generated[2] = _sz_emulate_aesenc_u64x2_neon(inputs[2], keys[2]); + generated[3] = _sz_emulate_aesenc_u64x2_neon(inputs[3], keys[3]); + vst1q_u64((sz_u64_t *)(text + i + 0), generated[0]); + vst1q_u64((sz_u64_t *)(text + i + 16), generated[1]); + vst1q_u64((sz_u64_t *)(text + i + 32), generated[2]); + vst1q_u64((sz_u64_t *)(text + i + 48), generated[3]); + inputs[0] = vaddq_u64(inputs[0], increment); + inputs[1] = vaddq_u64(inputs[1], increment); + inputs[2] = vaddq_u64(inputs[2], increment); + inputs[3] = vaddq_u64(inputs[3], increment); + } + + // Handle the tail of the buffer. 
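+        // The counters already point past the last full 64-byte chunk, so one more batch is
+        // generated and only the remaining `length - i` bytes of it are written out.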
+ { + generated[0] = _sz_emulate_aesenc_u64x2_neon(inputs[0], keys[0]); + generated[1] = _sz_emulate_aesenc_u64x2_neon(inputs[1], keys[1]); + generated[2] = _sz_emulate_aesenc_u64x2_neon(inputs[2], keys[2]); + generated[3] = _sz_emulate_aesenc_u64x2_neon(inputs[3], keys[3]); + for (sz_size_t j = 0; i < length; ++i, ++j) text[i] = ((sz_u8_t *)generated)[j]; + } + } +} #pragma clang attribute pop #pragma GCC pop_options diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index 11366304..164932df 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -634,6 +634,12 @@ typedef union sz_u256_vec_t { #if SZ_USE_HASWELL __m256i ymm; __m128i xmms[2]; +#endif +#if SZ_USE_NEON + uint8x16_t u8x16s[2]; + uint16x8_t u16x8s[2]; + uint32x4_t u32x4s[2]; + uint64x2_t u64x2s[2]; #endif sz_u64_t u64s[4]; sz_u32_t u32s[8]; @@ -653,6 +659,12 @@ typedef union sz_u512_vec_t { #if SZ_USE_HASWELL || SZ_USE_SKYLAKE || SZ_USE_ICE __m256i ymms[2]; __m128i xmms[4]; +#endif +#if SZ_USE_NEON + uint8x16_t u8x16s[4]; + uint16x8_t u16x8s[4]; + uint32x4_t u32x4s[4]; + uint64x2_t u64x2s[4]; #endif sz_u64_t u64s[8]; sz_i64_t i64s[8]; From d44beb4377fad8ccf8b8240245dc12bcb16eb346 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 19:27:59 +0000 Subject: [PATCH 171/751] Break: `sz::edit_distance` -> Levenshtein --- include/stringzilla/stringzilla.hpp | 38 +++++++++++---------- include/stringzilla/types.h | 1 + scripts/test.cpp | 51 +++++++++++++++-------------- 3 files changed, 47 insertions(+), 43 deletions(-) diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index ab99bc5f..b0146a82 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -2030,7 +2030,7 @@ class basic_string_slice { * * `replace`, `insert`, `erase`, `append`, `push_back`, `pop_back`, `resize`, `shrink_to_fit`... from STL, * * `try_` exception-free "try" operations that returning non-zero values on success, * * `replace_all` and `erase_all` similar to Boost, - * * `edit_distance` - Levenshtein distance computation reusing the allocator, + * * `levenshtein_distance` - Levenshtein distance computation reusing the allocator, * * `translate` - character mapping, * * `randomize`, `random` - for fast random string generation. 
* @@ -3360,11 +3360,12 @@ class basic_string { concatenation operator|(string_view other) const noexcept { return {view(), other}; } - size_type edit_distance(string_view other, size_type bound = 0) const noexcept { - size_type result; - _with_alloc([&](sz_alloc_type &alloc) { + size_type levenshtein_distance(string_view other, size_type bound = std::numeric_limits::max()) const + noexcept(false) { + size_type result = std::numeric_limits::max(); + raise(_with_alloc([&](sz_alloc_type &alloc) { return sz_levenshtein_distance(data(), size(), other.data(), other.size(), bound, &alloc, &result); - }); + })); return result; } @@ -3839,7 +3840,7 @@ typename concatenation_result::type template std::size_t hamming_distance( // basic_string_slice const &a, basic_string_slice const &b, // - std::size_t bound = 0) noexcept { + std::size_t bound = SZ_SIZE_MAX) noexcept { std::size_t result; sz_hamming_distance(a.data(), a.size(), b.data(), b.size(), bound, &result); return result; @@ -3852,7 +3853,7 @@ std::size_t hamming_distance( template ::type>> std::size_t hamming_distance( // basic_string const &a, basic_string const &b, // - std::size_t bound = 0) noexcept { + std::size_t bound = SZ_SIZE_MAX) noexcept { return ashvardanian::stringzilla::hamming_distance(a.view(), b.view(), bound); } @@ -3862,7 +3863,8 @@ std::size_t hamming_distance( */ template std::size_t hamming_distance_utf8( // - basic_string_slice const &a, basic_string_slice const &b, std::size_t bound = 0) noexcept { + basic_string_slice const &a, basic_string_slice const &b, + std::size_t bound = SZ_SIZE_MAX) noexcept { std::size_t result; sz_hamming_distance_utf8(a.data(), a.size(), b.data(), b.size(), bound, &result); return result; @@ -3875,7 +3877,7 @@ std::size_t hamming_distance_utf8( // template ::type>> std::size_t hamming_distance_utf8( // basic_string const &a, basic_string const &b, - std::size_t bound = 0) noexcept { + std::size_t bound = SZ_SIZE_MAX) noexcept { return ashvardanian::stringzilla::hamming_distance_utf8(a.view(), b.view(), bound); } @@ -3884,10 +3886,10 @@ std::size_t hamming_distance_utf8( // * @sa sz_levenshtein_distance */ template ::type>> -std::size_t edit_distance( // +std::size_t levenshtein_distance( // basic_string_slice const &a, basic_string_slice const &b, std::size_t bound = SZ_SIZE_MAX, allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) { - std::size_t result; + std::size_t result = SZ_SIZE_MAX; raise(_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) { return sz_levenshtein_distance(a.data(), a.size(), b.data(), b.size(), bound, &alloc, &result); })); @@ -3899,10 +3901,10 @@ std::size_t edit_distance( // * @sa sz_levenshtein_distance */ template > -std::size_t edit_distance( // +std::size_t levenshtein_distance( // basic_string const &a, basic_string const &b, // std::size_t bound = SZ_SIZE_MAX) noexcept(false) { - return ashvardanian::stringzilla::edit_distance(a.view(), b.view(), bound, a.get_allocator()); + return ashvardanian::stringzilla::levenshtein_distance(a.view(), b.view(), bound, a.get_allocator()); } /** @@ -3910,10 +3912,10 @@ std::size_t edit_distance( * @sa sz_levenshtein_distance_utf8 */ template ::type>> -std::size_t edit_distance_utf8( // +std::size_t levenshtein_distance_utf8( // basic_string_slice const &a, basic_string_slice const &b, // std::size_t bound = SZ_SIZE_MAX, allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) { - std::size_t result; + std::size_t result = SZ_SIZE_MAX; raise(_with_alloc(allocator, [&](sz_memory_allocator_t 
&alloc) { return sz_levenshtein_distance_utf8(a.data(), a.size(), b.data(), b.size(), bound, &alloc, &result); })); @@ -3925,10 +3927,10 @@ std::size_t edit_distance_utf8( * @sa sz_levenshtein_distance_utf8 */ template > -std::size_t edit_distance_utf8( // +std::size_t levenshtein_distance_utf8( // basic_string const &a, basic_string const &b, // std::size_t bound = SZ_SIZE_MAX) noexcept(false) { - return ashvardanian::stringzilla::edit_distance_utf8(a.view(), b.view(), bound, a.get_allocator()); + return ashvardanian::stringzilla::levenshtein_distance_utf8(a.view(), b.view(), bound, a.get_allocator()); } /** @@ -3945,7 +3947,7 @@ std::ptrdiff_t alignment_score( static_assert(std::is_signed() == std::is_signed(), "sz_error_cost_t must be signed."); - std::ptrdiff_t result; + std::ptrdiff_t result = SZ_SSIZE_MIN; raise(_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) { return sz_needleman_wunsch_score(a.data(), a.size(), b.data(), b.size(), &subs[0][0], gap, &alloc, &result); })); diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h index 164932df..3a117f61 100644 --- a/include/stringzilla/types.h +++ b/include/stringzilla/types.h @@ -811,6 +811,7 @@ SZ_PUBLIC void sz_sequence_from_null_terminated_strings(sz_cptr_t *start, sz_siz #define SZ_CACHE_LINE_WIDTH (64) // bytes #define SZ_SIZE_MAX ((sz_size_t)(-1)) #define SZ_SSIZE_MAX ((sz_ssize_t)(SZ_SIZE_MAX >> 1)) +#define SZ_SSIZE_MIN ((sz_ssize_t)(-SZ_SSIZE_MAX - 1)) SZ_INTERNAL sz_size_t _sz_size_max(void) { return SZ_SIZE_MAX; } SZ_INTERNAL sz_ssize_t _sz_ssize_max(void) { return SZ_SSIZE_MAX; } diff --git a/scripts/test.cpp b/scripts/test.cpp index ebcc01fe..8dd66dd9 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -975,27 +975,28 @@ static void test_non_stl_extensions_for_reads() { assert(sz::hamming_distance_utf8(str("abcdefgh"), str("_bcdefg_")) == 2); // replace ASCI prefix and suffix assert(sz::hamming_distance_utf8(str("αβγδ"), str("αγγδ")) == 1); // replace Beta UTF8 codepoint - assert(sz::edit_distance(str("hello"), str("hello")) == 0); - assert(sz::edit_distance(str("hello"), str("hell")) == 1); - assert(sz::edit_distance(str(""), str("")) == 0); - assert(sz::edit_distance(str(""), str("abc")) == 3); - assert(sz::edit_distance(str("abc"), str("")) == 3); - assert(sz::edit_distance(str("abc"), str("ac")) == 1); // one deletion - assert(sz::edit_distance(str("abc"), str("a_bc")) == 1); // one insertion - assert(sz::edit_distance(str("abc"), str("adc")) == 1); // one substitution - assert(sz::edit_distance(str("ggbuzgjux{}l"), str("gbuzgjux{}l")) == 1); // one insertion (prepended) - assert(sz::edit_distance(str("abcdefgABCDEFG"), str("ABCDEFGabcdefg")) == 14); - - assert(sz::edit_distance_utf8(str("hello"), str("hell")) == 1); // no unicode symbols, just ASCII - assert(sz::edit_distance_utf8(str("𠜎 𠜱 𠝹 𠱓"), str("𠜎𠜱𠝹𠱓")) == 3); // add 3 whitespaces in Chinese - assert(sz::edit_distance_utf8(str("💖"), str("💗")) == 1); - - assert(sz::edit_distance_utf8(str("αβγδ"), str("αγδ")) == 1); // insert Beta - assert(sz::edit_distance_utf8(str("école"), str("école")) == 2); // etter "é" as a single character vs "e" + "´" - assert(sz::edit_distance_utf8(str("façade"), str("facade")) == 1); // "ç" with cedilla vs. plain - assert(sz::edit_distance_utf8(str("Schön"), str("Scho\u0308n")) == 2); // "ö" represented as "o" + "¨" - assert(sz::edit_distance_utf8(str("München"), str("Muenchen")) == 2); // German with umlaut vs. 
transcription - assert(sz::edit_distance_utf8(str("こんにちは世界"), str("こんばんは世界")) == 2); + assert(sz::levenshtein_distance(str("hello"), str("hello")) == 0); + assert(sz::levenshtein_distance(str("hello"), str("hell")) == 1); + assert(sz::levenshtein_distance(str(""), str("")) == 0); + assert(sz::levenshtein_distance(str(""), str("abc")) == 3); + assert(sz::levenshtein_distance(str("abc"), str("")) == 3); + assert(sz::levenshtein_distance(str("abc"), str("ac")) == 1); // one deletion + assert(sz::levenshtein_distance(str("abc"), str("a_bc")) == 1); // one insertion + assert(sz::levenshtein_distance(str("abc"), str("adc")) == 1); // one substitution + assert(sz::levenshtein_distance(str("ggbuzgjux{}l"), str("gbuzgjux{}l")) == 1); // one insertion (prepended) + assert(sz::levenshtein_distance(str("abcdefgABCDEFG"), str("ABCDEFGabcdefg")) == 14); + + assert(sz::levenshtein_distance_utf8(str("hello"), str("hell")) == 1); // no unicode symbols, just ASCII + assert(sz::levenshtein_distance_utf8(str("𠜎 𠜱 𠝹 𠱓"), str("𠜎𠜱𠝹𠱓")) == 3); // add 3 whitespaces in Chinese + assert(sz::levenshtein_distance_utf8(str("💖"), str("💗")) == 1); + + assert(sz::levenshtein_distance_utf8(str("αβγδ"), str("αγδ")) == 1); // insert Beta + assert(sz::levenshtein_distance_utf8(str("école"), str("école")) == + 2); // etter "é" as a single character vs "e" + "´" + assert(sz::levenshtein_distance_utf8(str("façade"), str("facade")) == 1); // "ç" with cedilla vs. plain + assert(sz::levenshtein_distance_utf8(str("Schön"), str("Scho\u0308n")) == 2); // "ö" represented as "o" + "¨" + assert(sz::levenshtein_distance_utf8(str("München"), str("Muenchen")) == 2); // German with umlaut vs. transcription + assert(sz::levenshtein_distance_utf8(str("こんにちは世界"), str("こんばんは世界")) == 2); // Computing alignment scores. 
using matrix_t = std::int8_t[256][256]; @@ -1645,20 +1646,20 @@ static void test_levenshtein_distances() { }; auto test_distance = [&](sz::string const &l, sz::string const &r, std::size_t expected) { - auto received = sz::edit_distance(l, r); + auto received = sz::levenshtein_distance(l, r); auto received_score = sz::alignment_score(l, r, costs, -1); if (received != expected) print_failure("Levenshtein", l, r, expected, received); if ((std::size_t)(-received_score) != expected) print_failure("Scoring", l, r, expected, received_score); // The distance relation commutes - received = sz::edit_distance(r, l); + received = sz::levenshtein_distance(r, l); received_score = sz::alignment_score(r, l, costs, -1); if (received != expected) print_failure("Levenshtein", r, l, expected, received); if ((std::size_t)(-received_score) != expected) print_failure("Scoring", r, l, expected, received_score); // Validate the bounded variants: if (received > 1) { - assert(sz::edit_distance(l, r, received) == received); - assert(sz::edit_distance(r, l, received - 1) >= (std::max)(l.size(), r.size())); + assert(sz::levenshtein_distance(l, r, received) == received); + assert(sz::levenshtein_distance(r, l, received - 1) >= (std::max)(l.size(), r.size())); } }; From af54e933479b50bc3fef0c1cbe0511a384c2fb66 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 20:20:17 +0000 Subject: [PATCH 172/751] Improve: Separate PRNG backends in benchmarks --- scripts/bench_token.cpp | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/scripts/bench_token.cpp b/scripts/bench_token.cpp index 112fbc98..ac632767 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -97,7 +97,28 @@ tracked_unary_functions_t hash_stream_functions() { tracked_unary_functions_t random_generation_functions() { static std::vector buffer; + auto wrap_sz = [](auto function) -> unary_function_t { + return unary_function_t([function](std::string_view s) { + if (buffer.size() < s.size()) buffer.resize(s.size()); + function(buffer.data(), s.size(), 0); + return s.size(); + }); + }; + tracked_unary_functions_t result = { + {"sz_fill_random_serial", wrap_sz(sz_fill_random_serial)}, +#if SZ_USE_HASWELL + {"sz_fill_random_haswell", wrap_sz(sz_fill_random_haswell), true}, +#endif +#if SZ_USE_SKYLAKE + {"sz_fill_random_skylake", wrap_sz(sz_fill_random_skylake), true}, +#endif +#if SZ_USE_ICE + {"sz_fill_random_ice", wrap_sz(sz_fill_random_ice), true}, +#endif +#if SZ_USE_NEON + {"sz_fill_random_neon", wrap_sz(sz_fill_random_neon), true}, +#endif {"std::rand() & 0xFF", unary_function_t([](std::string_view token) -> std::size_t { if (buffer.size() < token.size()) buffer.resize(token.size()); for (std::size_t i = 0; i < token.size(); ++i) buffer[i] = static_cast(std::rand() & 0xFF); @@ -108,12 +129,6 @@ tracked_unary_functions_t random_generation_functions() { randomize_string(buffer.data(), token.size()); return token.size(); })}, - {"sz::randomize", unary_function_t([](std::string_view token) -> std::size_t { - if (buffer.size() < token.size()) buffer.resize(token.size()); - sz::string_span span(buffer.data(), token.size()); - sz::fill_random(span); - return token.size(); - })}, }; return result; } From c4f7a0e36b9afd67ce14763d9399535a93994f63 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 20:23:25 +0000 Subject: [PATCH 173/751] Improve: Discarding buffer in streaming hashes --- 
include/stringzilla/hash.h | 44 +++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index eb748ef4..3dab9488 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -947,7 +947,7 @@ SZ_PUBLIC sz_u64_t sz_hash_haswell(sz_cptr_t start, sz_size_t length, sz_u64_t s _sz_hash_minimal_init_haswell(&state, seed); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec, data2_vec; - data0_vec.xmm = _mm_lddqu_si128((__m128i const *)(start)); + data0_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + 0)); data1_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + 16)); data2_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + length - 16)); // Let's shift the data within the register to de-interleave the bytes. @@ -963,7 +963,7 @@ SZ_PUBLIC sz_u64_t sz_hash_haswell(sz_cptr_t start, sz_size_t length, sz_u64_t s _sz_hash_minimal_init_haswell(&state, seed); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec, data2_vec, data3_vec; - data0_vec.xmm = _mm_lddqu_si128((__m128i const *)(start)); + data0_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + 0)); data1_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + 16)); data2_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + 32)); data3_vec.xmm = _mm_lddqu_si128((__m128i const *)(start + length - 16)); @@ -981,7 +981,7 @@ SZ_PUBLIC sz_u64_t sz_hash_haswell(sz_cptr_t start, sz_size_t length, sz_u64_t s sz_hash_state_t state; sz_hash_state_init_haswell(&state, seed); for (; state.ins_length + 64 <= length; state.ins_length += 64) { - state.ins.xmms[0] = _mm_lddqu_si128((__m128i const *)(start + state.ins_length)); + state.ins.xmms[0] = _mm_lddqu_si128((__m128i const *)(start + state.ins_length + 0)); state.ins.xmms[1] = _mm_lddqu_si128((__m128i const *)(start + state.ins_length + 16)); state.ins.xmms[2] = _mm_lddqu_si128((__m128i const *)(start + state.ins_length + 32)); state.ins.xmms[3] = _mm_lddqu_si128((__m128i const *)(start + state.ins_length + 48)); @@ -1006,7 +1006,7 @@ SZ_PUBLIC void sz_hash_state_stream_haswell(sz_hash_state_t *state, sz_cptr_t te while (length) { // Append to the internal buffer until it's full if (state->ins_length % 64 == 0 && length >= 64) { - state->ins.xmms[0] = _mm_lddqu_si128((__m128i const *)text); + state->ins.xmms[0] = _mm_lddqu_si128((__m128i const *)(text + 0)); state->ins.xmms[1] = _mm_lddqu_si128((__m128i const *)(text + 16)); state->ins.xmms[2] = _mm_lddqu_si128((__m128i const *)(text + 32)); state->ins.xmms[3] = _mm_lddqu_si128((__m128i const *)(text + 48)); @@ -1029,7 +1029,7 @@ SZ_PUBLIC void sz_hash_state_stream_haswell(sz_hash_state_t *state, sz_cptr_t te if (will_fill_block) { _sz_hash_state_update_haswell(state); // Reset to zeros now, so we don't have to overwrite an immutable buffer in the folding state - for (int i = 0; i < 4; ++i) state->ins.xmms[i] = _mm_setzero_si128(); + for (int i = 0; i < 4; ++i) _mm_storeu_si128(&state->ins.xmms[i], _mm_setzero_si128()); } } } @@ -1043,7 +1043,7 @@ SZ_PUBLIC sz_u64_t sz_hash_state_fold_haswell(sz_hash_state_t const *state) { _sz_hash_minimal_t minimal_state; minimal_state.key.xmm = state->key.xmm; minimal_state.aes.xmm = state->aes.xmms[0]; - minimal_state.sum.xmm = state->sum.u64x2s[0]; + minimal_state.sum.xmm = state->sum.xmms[0]; // The logic is different depending on the length of the input __m128i const *ins_vecs = (__m128i const *)&state->ins.xmms[0]; @@ -1088,7 +1088,7 @@ SZ_PUBLIC void 
sz_fill_random_haswell(sz_ptr_t text, sz_size_t length, sz_u64_t __m128i inputs[2], pis[2], keys[2], generated[2]; inputs[0] = _mm_set1_epi64x(nonce); inputs[1] = _mm_set1_epi64x(nonce + 1); - pis[0] = _mm_load_si128((__m128i const *)(pi_ptr)); + pis[0] = _mm_load_si128((__m128i const *)(pi_ptr + 0)); pis[1] = _mm_load_si128((__m128i const *)(pi_ptr + 2)); keys[0] = _mm_xor_si128(_mm_set1_epi64x(nonce), pis[0]); keys[1] = _mm_xor_si128(_mm_set1_epi64x(nonce), pis[1]); @@ -1104,7 +1104,7 @@ SZ_PUBLIC void sz_fill_random_haswell(sz_ptr_t text, sz_size_t length, sz_u64_t inputs[0] = _mm_set1_epi64x(nonce); inputs[1] = _mm_set1_epi64x(nonce + 1); inputs[2] = _mm_set1_epi64x(nonce + 2); - pis[0] = _mm_load_si128((__m128i const *)(pi_ptr)); + pis[0] = _mm_load_si128((__m128i const *)(pi_ptr + 0)); pis[1] = _mm_load_si128((__m128i const *)(pi_ptr + 2)); pis[2] = _mm_load_si128((__m128i const *)(pi_ptr + 4)); keys[0] = _mm_xor_si128(_mm_set1_epi64x(nonce), pis[0]); @@ -1114,7 +1114,7 @@ SZ_PUBLIC void sz_fill_random_haswell(sz_ptr_t text, sz_size_t length, sz_u64_t generated[1] = _mm_aesenc_si128(inputs[1], keys[1]); generated[2] = _mm_aesenc_si128(inputs[2], keys[2]); // The first store can easily be vectorized, but the second can be serial for now - _mm_storeu_si128((__m128i *)text, generated[0]); + _mm_storeu_si128((__m128i *)(text + 0), generated[0]); _mm_storeu_si128((__m128i *)(text + 16), generated[1]); for (sz_size_t i = 32; i < length; ++i) text[i] = ((sz_u8_t *)generated)[i]; } @@ -1126,7 +1126,7 @@ SZ_PUBLIC void sz_fill_random_haswell(sz_ptr_t text, sz_size_t length, sz_u64_t inputs[2] = _mm_set1_epi64x(nonce + 2); inputs[3] = _mm_set1_epi64x(nonce + 3); // Load parts of PI into the registers - pis[0] = _mm_load_si128((__m128i const *)(pi_ptr)); + pis[0] = _mm_load_si128((__m128i const *)(pi_ptr + 0)); pis[1] = _mm_load_si128((__m128i const *)(pi_ptr + 2)); pis[2] = _mm_load_si128((__m128i const *)(pi_ptr + 4)); pis[3] = _mm_load_si128((__m128i const *)(pi_ptr + 6)); @@ -1144,7 +1144,7 @@ SZ_PUBLIC void sz_fill_random_haswell(sz_ptr_t text, sz_size_t length, sz_u64_t generated[1] = _mm_aesenc_si128(inputs[1], keys[1]); generated[2] = _mm_aesenc_si128(inputs[2], keys[2]); generated[3] = _mm_aesenc_si128(inputs[3], keys[3]); - _mm_storeu_si128((__m128i *)(text + i), generated[0]); + _mm_storeu_si128((__m128i *)(text + i + 0), generated[0]); _mm_storeu_si128((__m128i *)(text + i + 16), generated[1]); _mm_storeu_si128((__m128i *)(text + i + 32), generated[2]); _mm_storeu_si128((__m128i *)(text + i + 48), generated[3]); @@ -1389,7 +1389,7 @@ SZ_PUBLIC void sz_hash_state_stream_skylake(sz_hash_state_t *state, sz_cptr_t te if (will_fill_block) { _sz_hash_state_update_haswell(state); // Reset to zeros now, so we don't have to overwrite an immutable buffer in the folding state - state->ins.zmm = _mm512_setzero_si512(); + _mm512_storeu_si512(&state->ins.zmm, _mm512_setzero_si512()); } } } @@ -1803,7 +1803,7 @@ SZ_PUBLIC sz_u64_t sz_bytesum_neon(sz_cptr_t text, sz_size_t length) { // Final reduction of `sum_vec` to a single scalar sz_u64_t sum = vgetq_lane_u64(sum_vec, 0) + vgetq_lane_u64(sum_vec, 1); - if (length) sum += sz_bytesum_serial(text, length); + while (length--) sum += *(sz_u8_t const *)text++; // Same as the scalar version return sum; } @@ -1917,7 +1917,7 @@ SZ_PUBLIC void sz_hash_state_stream_neon(sz_hash_state_t *state, sz_cptr_t text, while (length) { // Append to the internal buffer until it's full if (state->ins_length % 64 == 0 && length >= 64) { - state->ins.u8x16s[0] = 
vld1q_u8((sz_u8_t const *)text); + state->ins.u8x16s[0] = vld1q_u8((sz_u8_t const *)(text + 0)); state->ins.u8x16s[1] = vld1q_u8((sz_u8_t const *)(text + 16)); state->ins.u8x16s[2] = vld1q_u8((sz_u8_t const *)(text + 32)); state->ins.u8x16s[3] = vld1q_u8((sz_u8_t const *)(text + 48)); @@ -1940,7 +1940,7 @@ SZ_PUBLIC void sz_hash_state_stream_neon(sz_hash_state_t *state, sz_cptr_t text, if (will_fill_block) { _sz_hash_state_update_neon(state); // Reset to zeros now, so we don't have to overwrite an immutable buffer in the folding state - for (int i = 0; i < 4; ++i) state->ins.u8x16s[i] = vdupq_n_u8(0); + for (int i = 0; i < 4; ++i) vst1q_u8(state->ins.u8s + i * 16, vdupq_n_u8(0)); } } } @@ -2001,10 +2001,10 @@ SZ_PUBLIC sz_u64_t sz_hash_neon(sz_cptr_t start, sz_size_t length, sz_u64_t seed _sz_hash_minimal_init_neon(&state, seed); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec; - data0_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start)); + data0_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start + 0)); data1_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start + length - 16)); // Let's shift the data within the register to de-interleave the bytes. - _sz_hash_shift_in_register_serial(&data1_vec, 32 - length); + _sz_hash_shift_in_register_serial(&data1_vec, 32 - length); //! `vextq_u8` requires immediates _sz_hash_minimal_update_neon(&state, data0_vec.u8x16); _sz_hash_minimal_update_neon(&state, data1_vec.u8x16); return _sz_hash_minimal_finalize_neon(&state, length); @@ -2015,11 +2015,11 @@ SZ_PUBLIC sz_u64_t sz_hash_neon(sz_cptr_t start, sz_size_t length, sz_u64_t seed _sz_hash_minimal_init_neon(&state, seed); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec, data2_vec; - data0_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start)); + data0_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start + 0)); data1_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start + 16)); data2_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start + length - 16)); // Let's shift the data within the register to de-interleave the bytes. - _sz_hash_shift_in_register_serial(&data2_vec, 48 - length); + _sz_hash_shift_in_register_serial(&data2_vec, 48 - length); //! `vextq_u8` requires immediates _sz_hash_minimal_update_neon(&state, data0_vec.u8x16); _sz_hash_minimal_update_neon(&state, data1_vec.u8x16); _sz_hash_minimal_update_neon(&state, data2_vec.u8x16); @@ -2031,12 +2031,12 @@ SZ_PUBLIC sz_u64_t sz_hash_neon(sz_cptr_t start, sz_size_t length, sz_u64_t seed _sz_hash_minimal_init_neon(&state, seed); // Load the data and update the state sz_u128_vec_t data0_vec, data1_vec, data2_vec, data3_vec; - data0_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start)); + data0_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start + 0)); data1_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start + 16)); data2_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start + 32)); data3_vec.u8x16 = vld1q_u8((sz_u8_t const *)(start + length - 16)); // Let's shift the data within the register to de-interleave the bytes. - _sz_hash_shift_in_register_serial(&data3_vec, 64 - length); + _sz_hash_shift_in_register_serial(&data3_vec, 64 - length); //! 
`vextq_u8` requires immediates _sz_hash_minimal_update_neon(&state, data0_vec.u8x16); _sz_hash_minimal_update_neon(&state, data1_vec.u8x16); _sz_hash_minimal_update_neon(&state, data2_vec.u8x16); @@ -2049,7 +2049,7 @@ SZ_PUBLIC sz_u64_t sz_hash_neon(sz_cptr_t start, sz_size_t length, sz_u64_t seed sz_hash_state_t state; sz_hash_state_init_neon(&state, seed); for (; state.ins_length + 64 <= length; state.ins_length += 64) { - state.ins.u8x16s[0] = vld1q_u8((sz_u8_t const *)(start + state.ins_length)); + state.ins.u8x16s[0] = vld1q_u8((sz_u8_t const *)(start + state.ins_length + 0)); state.ins.u8x16s[1] = vld1q_u8((sz_u8_t const *)(start + state.ins_length + 16)); state.ins.u8x16s[2] = vld1q_u8((sz_u8_t const *)(start + state.ins_length + 32)); state.ins.u8x16s[3] = vld1q_u8((sz_u8_t const *)(start + state.ins_length + 48)); From 828263f27903ae9df580531ee2589ea3876cfbaa Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 20:24:59 +0000 Subject: [PATCH 174/751] Improve: Discard state in streaming hash --- scripts/bench_token.cpp | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/scripts/bench_token.cpp b/scripts/bench_token.cpp index ac632767..8418f826 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -63,14 +63,15 @@ tracked_unary_functions_t hash_functions() { struct wrap_hash_stream { sz_hash_state_t state; + sz_hash_state_init_t init; sz_hash_state_stream_t stream; sz_hash_state_fold_t fold; - wrap_hash_stream(sz_hash_state_stream_t s, sz_hash_state_fold_t f) : stream(s), fold(f) { - sz_hash_state_init(&state, 42); - } + wrap_hash_stream(sz_hash_state_init_t i, sz_hash_state_stream_t s, sz_hash_state_fold_t f) + : init(i), stream(s), fold(f) {} std::size_t operator()(std::string_view s) noexcept { + init(&state, 42); stream(&state, s.data(), s.size()); return fold(&state); } @@ -78,18 +79,23 @@ struct wrap_hash_stream { tracked_unary_functions_t hash_stream_functions() { tracked_unary_functions_t result = { - {"sz_hash_stream_serial", wrap_hash_stream(sz_hash_state_stream_serial, sz_hash_state_fold_serial)}, + {"sz_hash_stream_serial", + wrap_hash_stream(sz_hash_state_init_serial, sz_hash_state_stream_serial, sz_hash_state_fold_serial)}, #if SZ_USE_HASWELL - {"sz_hash_stream_haswell", wrap_hash_stream(sz_hash_state_stream_haswell, sz_hash_state_fold_haswell), true}, + {"sz_hash_stream_haswell", + wrap_hash_stream(sz_hash_state_init_haswell, sz_hash_state_stream_haswell, sz_hash_state_fold_haswell), true}, #endif #if SZ_USE_SKYLAKE - {"sz_hash_stream_skylake", wrap_hash_stream(sz_hash_state_stream_skylake, sz_hash_state_fold_skylake), true}, + {"sz_hash_stream_skylake", + wrap_hash_stream(sz_hash_state_init_skylake, sz_hash_state_stream_skylake, sz_hash_state_fold_skylake), true}, #endif #if SZ_USE_ICE - {"sz_hash_stream_ice", wrap_hash_stream(sz_hash_state_stream_ice, sz_hash_state_fold_ice), true}, + {"sz_hash_stream_ice", + wrap_hash_stream(sz_hash_state_init_ice, sz_hash_state_stream_ice, sz_hash_state_fold_ice), true}, #endif #if SZ_USE_NEON - {"sz_hash_stream_neon", wrap_hash_stream(sz_hash_state_stream_neon, sz_hash_state_fold_neon), true}, + {"sz_hash_stream_neon", + wrap_hash_stream(sz_hash_state_init_neon, sz_hash_state_stream_neon, sz_hash_state_fold_neon), true}, #endif }; return result; From ff23c3de6851b83908cd81ab9b0e47e26d340f2f Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 9 Mar 2025 20:27:29 
+0000 Subject: [PATCH 175/751] Fix: `std::string::data` is mutable only since C++17 --- scripts/bench_token.cpp | 2 +- scripts/test.cpp | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/scripts/bench_token.cpp b/scripts/bench_token.cpp index 8418f826..af16e7a4 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -132,7 +132,7 @@ tracked_unary_functions_t random_generation_functions() { })}, {"std::uniform_int", unary_function_t([](std::string_view token) -> std::size_t { if (buffer.size() < token.size()) buffer.resize(token.size()); - randomize_string(buffer.data(), token.size()); + randomize_string(&buffer[0], token.size()); return token.size(); })}, }; diff --git a/scripts/test.cpp b/scripts/test.cpp index 8dd66dd9..58756efe 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -223,6 +223,13 @@ static void test_hashing_on_platform( // for (auto seed : seeds) for (std::size_t copies = 1; copies != 100; ++copies) // test_on_seed(repeat("abc", copies), seed); + + // Let's try truly random inputs of different lengths: + for (std::size_t length = 0; length != 200; ++length) { + std::string text(length, '\0'); + randomize_string(&text[0], length); + for (auto seed : seeds) test_on_seed(text, seed); + } } /** From f9da4edcbc845ff83f65c62818dd913179470e77 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 10 Mar 2025 06:00:01 +0000 Subject: [PATCH 176/751] Fix: Composing STL collections --- include/stringzilla/stringzilla.hpp | 20 ++++++++++---------- scripts/bench_container.cpp | 27 +++++++++++++++------------ scripts/test.cpp | 4 ++-- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index b0146a82..03cef8d1 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -1971,7 +1971,7 @@ class basic_string_slice { #pragma endregion /** @brief Hashes the string, equivalent to `std::hash{}(str)`. */ - size_type hash(std::uint64_t seed = 42) const noexcept { + size_type hash(std::uint64_t seed = 0) const noexcept { return static_cast(sz_hash(start_, length_, static_cast(seed))); } @@ -3759,10 +3759,10 @@ bool basic_string::try_preparing_replacement( // * @see Similar to `std::less`: https://en.cppreference.com/w/cpp/utility/functional/less * * Unlike the STL analog, doesn't require C++14 or including the heavy `` header. - * Can be used to combine STL classes with StringZilla logic, like: `std::map`. + * Can be used to combine STL classes with StringZilla logic, like: `std::map`. */ -struct string_view_less { - bool operator()(string_view a, string_view b) const noexcept { return a < b; } +struct less { + inline bool operator()(string_view a, string_view b) const noexcept { return a < b; } }; /** @@ -3771,10 +3771,10 @@ struct string_view_less { * * Unlike the STL analog, doesn't require C++14 or including the heavy `` header. * Can be used to combine STL classes with StringZilla logic, like: - * `std::unordered_map`. + * `std::unordered_map`. */ -struct string_view_equal_to { - bool operator()(string_view a, string_view b) const noexcept { return a == b; } +struct equal_to { + inline bool operator()(string_view a, string_view b) const noexcept { return a == b; } }; /** @@ -3783,10 +3783,10 @@ struct string_view_equal_to { * * Unlike the STL analog, doesn't require C++14 or including the heavy `` header. 
* Can be used to combine STL classes with StringZilla logic, like: - * `std::unordered_map`. + * `std::unordered_map`. */ -struct string_view_hash { - std::size_t operator()(string_view str) const noexcept { return str.hash(); } +struct hash { + inline std::size_t operator()(string_view str) const noexcept { return str.hash(); } }; /** @brief SFINAE-type used to infer the resulting type of concatenating multiple string together. */ diff --git a/scripts/bench_container.cpp b/scripts/bench_container.cpp index 17cd1ec6..ab214517 100644 --- a/scripts/bench_container.cpp +++ b/scripts/bench_container.cpp @@ -51,22 +51,23 @@ void bench_tokens(strings_type const &strings) { auto const &s = strings; // StringZilla structures - bench>("map", s); - bench>("map", s); - bench>("unordered_map", s); - bench>("unordered_map", s); + bench>("std::map", s); + bench>("std::map", s); + bench>("std::umap", s); + bench>("std::umap", s); // Pure STL - bench>("map", s); - bench>("map", s); - bench>("unordered_map", s); - bench>("unordered_map", s); + bench>("std::map", s); + bench>("std::map", s); + bench>("std::umap", s); + bench>("std::umap", s); // STL structures with StringZilla operations - // bench>("map", s); - // bench>("map", s); - // bench>("unordered_map", s); - // bench>("unordered_map", s); + bench>("std::map", s); + bench>("std::map", s); + bench>("std::umap", s); + bench>("std::umap", + s); } int main(int argc, char const **argv) { @@ -77,6 +78,8 @@ int main(int argc, char const **argv) { // Baseline benchmarks for real words, coming in all lengths std::printf("Benchmarking on real words:\n"); bench_tokens(dataset.tokens); + std::printf("Benchmarking on real lines:\n"); + bench_tokens(dataset.lines); // Run benchmarks on tokens of different length for (std::size_t token_length : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32}) { diff --git a/scripts/test.cpp b/scripts/test.cpp index 58756efe..0137053c 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -1915,8 +1915,8 @@ static void test_stl_containers() { assert(sorted_words_sz.empty()); assert(words_sz.empty()); - std::map sorted_words_stl; - std::unordered_map words_stl; + std::map sorted_words_stl; + std::unordered_map words_stl; assert(sorted_words_stl.empty()); assert(words_stl.empty()); } From 4bec1e511237b530454ec0e7d3d7b2ff35cd96d1 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 10 Mar 2025 07:40:00 +0000 Subject: [PATCH 177/751] Fix: Revert to XMM on Haswell --- include/stringzilla/hash.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/stringzilla/hash.h b/include/stringzilla/hash.h index 3dab9488..5aa40884 100644 --- a/include/stringzilla/hash.h +++ b/include/stringzilla/hash.h @@ -875,7 +875,7 @@ SZ_PUBLIC void sz_hash_state_init_haswell(sz_hash_state_t *state, sz_u64_t seed) for (int i = 0; i < 4; ++i) state->aes.xmms[i] = _mm_xor_si128(seed_vec, _mm_load_si128((__m128i const *)(pi + i * 2))); for (int i = 0; i < 4; ++i) - state->sum.u64x2s[i] = _mm_xor_si128(seed_vec, _mm_load_si128((__m128i const *)(pi + i * 2 + 8))); + state->sum.xmms[i] = _mm_xor_si128(seed_vec, _mm_load_si128((__m128i const *)(pi + i * 2 + 8))); // The inputs are zeroed out at the beginning state->ins.xmms[0] = state->ins.xmms[1] = state->ins.xmms[2] = state->ins.xmms[3] = _mm_setzero_si128(); @@ -885,23 +885,23 @@ SZ_PUBLIC void sz_hash_state_init_haswell(sz_hash_state_t *state, sz_u64_t seed) SZ_INTERNAL void _sz_hash_state_update_haswell(sz_hash_state_t *state) { __m128i 
const shuffle_mask = _mm_load_si128((__m128i const *)_sz_hash_u8x16x4_shuffle()); state->aes.xmms[0] = _mm_aesenc_si128(state->aes.xmms[0], state->ins.xmms[0]); - state->sum.u64x2s[0] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.u64x2s[0], shuffle_mask), state->ins.xmms[0]); + state->sum.xmms[0] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[0], shuffle_mask), state->ins.xmms[0]); state->aes.xmms[1] = _mm_aesenc_si128(state->aes.xmms[1], state->ins.xmms[1]); - state->sum.u64x2s[1] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.u64x2s[1], shuffle_mask), state->ins.xmms[1]); + state->sum.xmms[1] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[1], shuffle_mask), state->ins.xmms[1]); state->aes.xmms[2] = _mm_aesenc_si128(state->aes.xmms[2], state->ins.xmms[2]); - state->sum.u64x2s[2] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.u64x2s[2], shuffle_mask), state->ins.xmms[2]); + state->sum.xmms[2] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[2], shuffle_mask), state->ins.xmms[2]); state->aes.xmms[3] = _mm_aesenc_si128(state->aes.xmms[3], state->ins.xmms[3]); - state->sum.u64x2s[3] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.u64x2s[3], shuffle_mask), state->ins.xmms[3]); + state->sum.xmms[3] = _mm_add_epi64(_mm_shuffle_epi8(state->sum.xmms[3], shuffle_mask), state->ins.xmms[3]); } SZ_INTERNAL sz_u64_t _sz_hash_state_finalize_haswell(sz_hash_state_t const *state) { // Mix the length into the key __m128i key_with_length = _mm_add_epi64(state->key.xmm, _mm_set_epi64x(0, state->ins_length)); // Combine the "sum" and the "AES" blocks - __m128i mixed_registers0 = _mm_aesenc_si128(state->sum.u64x2s[0], state->aes.xmms[0]); - __m128i mixed_registers1 = _mm_aesenc_si128(state->sum.u64x2s[1], state->aes.xmms[1]); - __m128i mixed_registers2 = _mm_aesenc_si128(state->sum.u64x2s[2], state->aes.xmms[2]); - __m128i mixed_registers3 = _mm_aesenc_si128(state->sum.u64x2s[3], state->aes.xmms[3]); + __m128i mixed_registers0 = _mm_aesenc_si128(state->sum.xmms[0], state->aes.xmms[0]); + __m128i mixed_registers1 = _mm_aesenc_si128(state->sum.xmms[1], state->aes.xmms[1]); + __m128i mixed_registers2 = _mm_aesenc_si128(state->sum.xmms[2], state->aes.xmms[2]); + __m128i mixed_registers3 = _mm_aesenc_si128(state->sum.xmms[3], state->aes.xmms[3]); // Combine the mixed registers __m128i mixed_registers01 = _mm_aesenc_si128(mixed_registers0, mixed_registers1); __m128i mixed_registers23 = _mm_aesenc_si128(mixed_registers2, mixed_registers3); From 48d70ea4d859c8624af661f4a10aa654aa46e6a7 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 10 Mar 2025 07:40:29 +0000 Subject: [PATCH 178/751] Fix: No intersect for Skylake --- c/lib.c | 3 ++- include/stringzilla/intersect.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/c/lib.c b/c/lib.c index 7aea9455..afffeeff 100644 --- a/c/lib.c +++ b/c/lib.c @@ -298,7 +298,6 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->bytesum = sz_bytesum_skylake; impl->sequence_argsort = sz_sequence_argsort_skylake; - impl->sequence_intersect = sz_sequence_intersect_skylake; impl->pgrams_sort = sz_pgrams_sort_skylake; } #endif @@ -319,6 +318,8 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) { impl->hash_state_stream = sz_hash_state_stream_ice; impl->hash_state_fold = sz_hash_state_fold_ice; impl->fill_random = sz_fill_random_ice; + + impl->sequence_intersect = sz_sequence_intersect_ice; } #endif diff --git a/include/stringzilla/intersect.h b/include/stringzilla/intersect.h index b3610969..cf24eb57 100644 --- 
a/include/stringzilla/intersect.h +++ b/include/stringzilla/intersect.h @@ -58,7 +58,7 @@ extern "C" { * Example usage: * * @code{.c} - * #include + * #include * int main() { * char const *first[] = {"banana", "apple", "cherry"}; * char const *second[] = {"cherry", "orange", "pineapple", "banana"}; From 4d955d37530a5e60c73fedd8d057da8a75203b55 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 10 Mar 2025 08:06:31 +0000 Subject: [PATCH 179/751] Improve: Logging in container benchmarks --- scripts/bench.hpp | 3 ++- scripts/bench_container.cpp | 53 ++++++++++++++++++------------------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/scripts/bench.hpp b/scripts/bench.hpp index cbec9bf5..69be9722 100644 --- a/scripts/bench.hpp +++ b/scripts/bench.hpp @@ -199,7 +199,8 @@ inline dataset_t make_dataset_from_path(std::string path) { "Parsed the dataset with:\n" // "- %zu words of mean length ~ %.2f bytes\n" // "- %zu lines of mean length ~ %.2f bytes\n", // - data.tokens.size(), mean_token_bytes, data.lines.size(), mean_line_bytes); + "- %zu bytes in total\n", // + data.tokens.size(), mean_token_bytes, data.lines.size(), mean_line_bytes, data.text.size()); return data; } diff --git a/scripts/bench_container.cpp b/scripts/bench_container.cpp index ab214517..38d92038 100644 --- a/scripts/bench_container.cpp +++ b/scripts/bench_container.cpp @@ -14,25 +14,25 @@ using namespace ashvardanian::stringzilla::scripts; -template -std::vector to(std::vector const &strings) { - std::vector result; +template +std::vector to(std::vector const &strings) { + std::vector result; result.reserve(strings.size()); - for (string_type_from const &string : strings) result.push_back({string.data(), string.size()}); + for (string_from_type_ const &string : strings) result.push_back({string.data(), string.size()}); return result; } /** * @brief Evaluation for search string operations: find. 
*/ -template +template void bench(std::string name, std::vector const &strings) { - using key_type = typename container_at::key_type; + using key_type = typename container_type_::key_type; std::vector keys = to(strings); // Build up the container - container_at container; + container_type_ container; for (key_type const &key : keys) container[key] = 0; tracked_function_gt variant; @@ -45,33 +45,32 @@ void bench(std::string name, std::vector const &strings) { variant.print(); } -template -void bench_tokens(strings_type const &strings) { - if (strings.size() == 0) return; - auto const &s = strings; +template +void bench_tokens(strings_type_ const &s) { + if (s.size() == 0) return; - // StringZilla structures - bench>("std::map", s); - bench>("std::map", s); - bench>("std::umap", s); - bench>("std::umap", s); + // STL containers with StringZilla strings and views + bench>("std::map::find", s); + bench>("std::map::find", s); + bench>("std::umap::find", s); + bench>("std::umap::find", s); - // Pure STL - bench>("std::map", s); - bench>("std::map", s); - bench>("std::umap", s); - bench>("std::umap", s); + // STL containers with STL strings and views + bench>("std::map::find", s); + bench>("std::map::find", s); + bench>("std::umap::find", s); + bench>("std::umap::find", s); // STL structures with StringZilla operations - bench>("std::map", s); - bench>("std::map", s); - bench>("std::umap", s); - bench>("std::umap", - s); + bench>("std::map::find", s); + bench>("std::map::find", s); + bench>("std::umap::find", s); + bench>( + "std::umap::find", s); } int main(int argc, char const **argv) { - std::printf("StringZilla. Starting search benchmarks.\n"); + std::printf("StringZilla. Starting container benchmarks.\n"); dataset_t dataset = prepare_benchmark_environment(argc, argv); From 3b1897ef2acb8d00834c13f8873210244932ee2f Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 10 Mar 2025 10:15:35 +0000 Subject: [PATCH 180/751] Improve: Token benchmarks In the past, token benchmarks weren't balanced. For equality comparisons and ordering, they would take random strings which are almost always differing in the very first character and in length, making branch prediction trivial and performance identical between backends. The new benchmarks include self-comparisons, which are more similar to hash-table probing or strings sorting workloads. --- scripts/bench.hpp | 26 ++++++--- scripts/bench_token.cpp | 115 +++++++++++++++++++++++++++------------- 2 files changed, 97 insertions(+), 44 deletions(-) diff --git a/scripts/bench.hpp b/scripts/bench.hpp index cbec9bf5..c0877033 100644 --- a/scripts/bench.hpp +++ b/scripts/bench.hpp @@ -41,6 +41,22 @@ struct benchmark_result_t { using unary_function_t = std::function; using binary_function_t = std::function; +/** + * @brief Wraps a binary function to compare all combinations of two tokens. + * Designed to benchmark functions that on-average take very different times to execute + * for the same string or different strings. For equality checks it's similar to a typical + * load when probing a Hash Table. For relative ordering, it's similar to sorting a dense + * array with many similar strings. + */ +template +binary_function_t binary_combinations(function_type_ function) { + return binary_function_t([function](std::string_view a, std::string_view b) { + // Assuming most outputs here will be 0 or 1, we want to combine them to with different + // multiples to ensure a unique output for each combination. 
+ return function(a, b) * 1 + function(a, a) * 2 + function(b, a) * 4 + function(b, b) * 8; + }); +} + /** * @brief Wrapper for a single execution backend. */ @@ -144,9 +160,7 @@ inline std::vector tokenize(std::string_view str, is_separator return words; } -/** - * @brief Splits a string into words, using newlines, tabs, and whitespaces as delimiters. - */ +/** @brief Splits a string into words, using newlines, tabs, and whitespaces as delimiters using @b `std::isspace`. */ inline std::vector tokenize(std::string_view str) { return tokenize(str, [](char c) { return std::isspace(c); }); } @@ -175,11 +189,11 @@ struct dataset_t { inline dataset_t make_dataset_from_path(std::string path) { dataset_t data; data.text = read_file(path); - data.text.resize(bit_floor(data.text.size())); + data.text.resize(bit_floor(data.text.size())); // Shrink to the nearest power of two data.tokens = tokenize(data.text); - data.tokens.resize(bit_floor(data.tokens.size())); + data.tokens.resize(bit_floor(data.tokens.size())); // Shrink to the nearest power of two data.lines = tokenize(data.text, [](char c) { return c == '\n'; }); - data.lines.resize(bit_floor(data.lines.size())); + data.lines.resize(bit_floor(data.lines.size())); // Shrink to the nearest power of two #if !SZ_DEBUG // Shuffle only in release mode auto &generator = global_random_generator(); diff --git a/scripts/bench_token.cpp b/scripts/bench_token.cpp index af16e7a4..57cb4bc2 100644 --- a/scripts/bench_token.cpp +++ b/scripts/bench_token.cpp @@ -11,6 +11,10 @@ using namespace ashvardanian::stringzilla::scripts; +/** + * @brief Provides kernels, each computing the unsigned sum of bytes in given tokens. + * Compares all supported SIMD backed outputs to the serial implementation. + */ tracked_unary_functions_t bytesum_functions() { auto wrap_sz = [](auto function) -> unary_function_t { return unary_function_t([function](std::string_view s) { return function(s.data(), s.size()); }); @@ -38,9 +42,13 @@ tracked_unary_functions_t bytesum_functions() { return result; } +/** + * @brief Provides kernels, each computing the hash of given tokens using the same seed. + * Compares all supported SIMD backed outputs to the serial implementation. + */ tracked_unary_functions_t hash_functions() { auto wrap_sz = [](auto function) -> unary_function_t { - return unary_function_t([function](std::string_view s) { return function(s.data(), s.size(), 42); }); + return unary_function_t([function](std::string_view s) { return function(s.data(), s.size(), 0); }); }; tracked_unary_functions_t result = { {"sz_hash_serial", wrap_sz(sz_hash_serial)}, @@ -61,13 +69,14 @@ tracked_unary_functions_t hash_functions() { return result; } -struct wrap_hash_stream { +/** @brief Wraps hash state initialization, streaming, and folding for streaming benchmarks. */ +struct wrap_sz_hash_stream { sz_hash_state_t state; sz_hash_state_init_t init; sz_hash_state_stream_t stream; sz_hash_state_fold_t fold; - wrap_hash_stream(sz_hash_state_init_t i, sz_hash_state_stream_t s, sz_hash_state_fold_t f) + wrap_sz_hash_stream(sz_hash_state_init_t i, sz_hash_state_stream_t s, sz_hash_state_fold_t f) : init(i), stream(s), fold(f) {} std::size_t operator()(std::string_view s) noexcept { @@ -77,30 +86,40 @@ struct wrap_hash_stream { } }; +/** + * @brief Provides kernels, each computing the hash of given tokens using more expensive "streaming" API. + * Compares all supported SIMD backed outputs to the serial implementation. 
+ */ tracked_unary_functions_t hash_stream_functions() { tracked_unary_functions_t result = { {"sz_hash_stream_serial", - wrap_hash_stream(sz_hash_state_init_serial, sz_hash_state_stream_serial, sz_hash_state_fold_serial)}, + wrap_sz_hash_stream(sz_hash_state_init_serial, sz_hash_state_stream_serial, sz_hash_state_fold_serial)}, #if SZ_USE_HASWELL {"sz_hash_stream_haswell", - wrap_hash_stream(sz_hash_state_init_haswell, sz_hash_state_stream_haswell, sz_hash_state_fold_haswell), true}, + wrap_sz_hash_stream(sz_hash_state_init_haswell, sz_hash_state_stream_haswell, sz_hash_state_fold_haswell), + true}, #endif #if SZ_USE_SKYLAKE {"sz_hash_stream_skylake", - wrap_hash_stream(sz_hash_state_init_skylake, sz_hash_state_stream_skylake, sz_hash_state_fold_skylake), true}, + wrap_sz_hash_stream(sz_hash_state_init_skylake, sz_hash_state_stream_skylake, sz_hash_state_fold_skylake), + true}, #endif #if SZ_USE_ICE {"sz_hash_stream_ice", - wrap_hash_stream(sz_hash_state_init_ice, sz_hash_state_stream_ice, sz_hash_state_fold_ice), true}, + wrap_sz_hash_stream(sz_hash_state_init_ice, sz_hash_state_stream_ice, sz_hash_state_fold_ice), true}, #endif #if SZ_USE_NEON {"sz_hash_stream_neon", - wrap_hash_stream(sz_hash_state_init_neon, sz_hash_state_stream_neon, sz_hash_state_fold_neon), true}, + wrap_sz_hash_stream(sz_hash_state_init_neon, sz_hash_state_stream_neon, sz_hash_state_fold_neon), true}, #endif }; return result; } +/** + * @brief Provides kernels, each generating random bytes for given tokens using the same "nonce". + * Compares all supported SIMD backed outputs to the serial implementation. + */ tracked_unary_functions_t random_generation_functions() { static std::vector buffer; auto wrap_sz = [](auto function) -> unary_function_t { @@ -139,55 +158,75 @@ tracked_unary_functions_t random_generation_functions() { return result; } +/** @brief Wraps string equality check for potentially different length inputs. */ +struct wrap_sz_equal { + sz_equal_t function; + + wrap_sz_equal(sz_equal_t f) : function(f) {} + bool operator()(std::string_view a, std::string_view b) const noexcept { + return a.size() == b.size() && function(a.data(), b.data(), a.size()); + } +}; + +/** @brief Wraps LibC's string equality check for potentially different length inputs. */ +bool memcmp_for_equality(std::string_view a, std::string_view b) noexcept { + return (a.size() == b.size() && memcmp(a.data(), b.data(), a.size()) == 0); +} + +/** + * @brief Provides kernels, each comparing two tokens for equality. + * Compares all supported SIMD backed outputs to the serial implementation. + * In each iteration combines self- and cross-compares to dampen the branch prediction effect, + * assuming most random string would differ in the very first byte. 
+ */ tracked_binary_functions_t equality_functions() { - auto wrap_sz = [](auto function) -> binary_function_t { - return binary_function_t([function](std::string_view a, std::string_view b) { - return a.size() == b.size() && function(a.data(), b.data(), a.size()); - }); - }; tracked_binary_functions_t result = { - {"std::string_view.==", [](std::string_view a, std::string_view b) { return a == b; }}, - {"sz_equal_serial", wrap_sz(sz_equal_serial), true}, + {"sz_equal_serial", binary_combinations(wrap_sz_equal(sz_equal_serial))}, #if SZ_USE_HASWELL - {"sz_equal_haswell", wrap_sz(sz_equal_haswell), true}, + {"sz_equal_haswell", binary_combinations(wrap_sz_equal(sz_equal_haswell)), true}, #endif #if SZ_USE_SKYLAKE - {"sz_equal_skylake", wrap_sz(sz_equal_skylake), true}, + {"sz_equal_skylake", binary_combinations(wrap_sz_equal(sz_equal_skylake)), true}, #endif - {"memcmp", - [](std::string_view a, std::string_view b) { - return (a.size() == b.size() && memcmp(a.data(), b.data(), a.size()) == 0); - }}, +#if SZ_USE_SVE + {"sz_equal_sve", binary_combinations(wrap_sz_equal(sz_equal_sve)), true}, +#endif +#if SZ_USE_NEON + {"sz_equal_neon", binary_combinations(wrap_sz_equal(sz_equal_neon)), true}, +#endif + {"memcmp(equality)", binary_combinations(memcmp_for_equality)}, }; return result; } +/** @brief Wraps LibC's string comparison for potentially different length inputs. */ +int memcmp_for_ordering(std::string_view a, std::string_view b) noexcept { + auto order = memcmp(a.data(), b.data(), a.size() < b.size() ? a.size() : b.size()); + if (order == 0) return a.size() == b.size() ? 0 : (a.size() < b.size() ? -1 : 1); + return order; +} + +/** + * @brief Provides kernels, each computing the relative order of two tokens. + * Compares all supported SIMD backed outputs to the serial implementation. + * In each iteration combines self- and cross-compares to dampen the branch prediction effect, + * assuming most random string would differ in the very first byte. + */ tracked_binary_functions_t ordering_functions() { auto wrap_sz = [](auto function) -> binary_function_t { return binary_function_t([function](std::string_view a, std::string_view b) { - return function(a.data(), a.size(), b.data(), b.size()); + return (int)function(a.data(), a.size(), b.data(), b.size()); }); }; tracked_binary_functions_t result = { - {"std::string_view.compare", - [](std::string_view a, std::string_view b) { - auto order = a.compare(b); - return (order == 0 ? sz_equal_k : (order < 0 ? sz_less_k : sz_greater_k)); - }}, - {"sz_order_serial", wrap_sz(sz_order_serial), true}, + {"sz_order_serial", binary_combinations(wrap_sz(sz_order_serial))}, #if SZ_USE_HASWELL - {"sz_order_haswell", wrap_sz(sz_order_haswell), true}, + {"sz_order_haswell", binary_combinations(wrap_sz(sz_order_haswell)), true}, #endif #if SZ_USE_SKYLAKE - {"sz_order_skylake", wrap_sz(sz_order_skylake), true}, -#endif - {"memcmp", - [](std::string_view a, std::string_view b) { - auto order = memcmp(a.data(), b.data(), a.size() < b.size() ? a.size() : b.size()); - return order != 0 ? (a.size() == b.size() ? (order < 0 ? sz_less_k : sz_greater_k) - : (a.size() < b.size() ? 
sz_less_k : sz_greater_k)) - : sz_equal_k; - }}, + {"sz_order_skylake", binary_combinations(wrap_sz(sz_order_skylake)), true}, +#endif + {"memcmp(ordering)", binary_combinations(memcmp_for_ordering)}, }; return result; } From c31020dfbb0bc40a7bfdf86091add8cae1a2fd0c Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 10 Mar 2025 10:17:32 +0000 Subject: [PATCH 181/751] Add: Comparisons in SVE This leads to doubling the performance on mixed workloads which may include self-comparisons, where both comparison arguments are the same. --- include/stringzilla/compare.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/include/stringzilla/compare.h b/include/stringzilla/compare.h index 494d1442..b6412016 100644 --- a/include/stringzilla/compare.h +++ b/include/stringzilla/compare.h @@ -399,7 +399,26 @@ SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { #pragma GCC target("arch=armv8.2-a+sve") #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) -/* Nothing here for now. */ +SZ_PUBLIC sz_bool_t sz_equal_sve(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { + // Determine the number of bytes in an SVE vector. + do { + svbool_t progress_vec = svwhilelt_b8((sz_size_t)0, length); + svuint8_t a_vec = svld1(progress_vec, (sz_u8_t const *)a); + svuint8_t b_vec = svld1(progress_vec, (sz_u8_t const *)b); + // Compare: generate a predicate marking lanes where a!=b + svbool_t not_equal_vec = svcmpne(progress_vec, a_vec, b_vec); + if (svptest_any(progress_vec, not_equal_vec)) return sz_false_k; + sz_size_t const vector_length = svcntp_b8(svptrue_b8(), progress_vec); + a += vector_length, b += vector_length, length -= vector_length; + } while (length > 0); + return sz_true_k; +} + +SZ_PUBLIC sz_ordering_t sz_order_sve(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) { + //! Before optimizing this, read the "Operations Not Worth Optimizing" in Contributions Guide: + //! 
https://github.com/ashvardanian/StringZilla/blob/main/CONTRIBUTING.md#general-performance-observations + return sz_order_serial(a, a_length, b, b_length); +} #pragma clang attribute pop #pragma GCC pop_options @@ -417,6 +436,8 @@ SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length) { return sz_equal_skylake(a, b, length); #elif SZ_USE_HASWELL return sz_equal_haswell(a, b, length); +#elif SZ_USE_SVE + return sz_equal_sve(a, b, length); #elif SZ_USE_NEON return sz_equal_neon(a, b, length); #else @@ -429,6 +450,8 @@ SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, return sz_order_skylake(a, a_length, b, b_length); #elif SZ_USE_HASWELL return sz_order_haswell(a, a_length, b, b_length); +#elif SZ_USE_SVE + return sz_order_sve(a, a_length, b, b_length); #elif SZ_USE_NEON return sz_order_neon(a, a_length, b, b_length); #else From 92b9a569d9b0ad24afd9f3a51ef312ccd130ce71 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 10 Mar 2025 10:34:28 +0000 Subject: [PATCH 182/751] Docs: Outdated function naming & spelling --- .vscode/settings.json | 9 +++++++++ README.md | 28 ++++++++++++++-------------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 678b1305..85f842ea 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -69,6 +69,7 @@ "hexdigits", "Hirschberg's", "Horspool", + "Hutter", "Hyyro", "illformed", "initproc", @@ -100,6 +101,7 @@ "Morten", "Mosè", "MSVC", + "Nadav", "napi", "nargsf", "ndim", @@ -119,6 +121,7 @@ "pgrams", "Plouffe", "printables", + "ptrdiff", "pytest", "Pythonic", "qsort", @@ -134,13 +137,17 @@ "Ritchie", "rmatcher", "rmatches", + "Rotem", "rpartition", "rsplit", "rsplits", "rstrip", + "Sankoff", + "Sergey", "SIMD", "sklearn", "Skylake", + "Slotin", "splitlines", "ssize", "startswith", @@ -152,6 +159,7 @@ "substr", "SWAR", "Tanimoto", + "Taras", "thyrotropin", "Titin", "tparam", @@ -163,6 +171,7 @@ "VBMI", "vectorcallfunc", "Vectorizer", + "Vintsyuk", "Wagner", "whitespaces", "Wunsch", diff --git a/README.md b/README.md index a3121cb4..b544bb1c 100644 --- a/README.md +++ b/README.md @@ -486,9 +486,9 @@ count: int = sz.count("haystack", "needle", start=0, end=sys.maxsize, allowoverl ### Edit Distances ```py -assert sz.edit_distance("apple", "aple") == 1 # skip one ASCII character -assert sz.edit_distance("αβγδ", "αγδ") == 2 # skip two bytes forming one rune -assert sz.edit_distance_unicode("αβγδ", "αγδ") == 1 # one unicode rune +assert sz.levenshtein_distance("apple", "aple") == 1 # skip one ASCII character +assert sz.levenshtein_distance("αβγδ", "αγδ") == 2 # skip two bytes forming one rune +assert sz.levenshtein_distance_unicode("αβγδ", "αγδ") == 1 # one unicode rune ``` Several Python libraries provide edit distance computation. 
@@ -513,7 +513,7 @@ costs = np.zeros((256, 256), dtype=np.int8) costs.fill(-1) np.fill_diagonal(costs, 0) -assert sz.alignment_score("first", "second", substitution_matrix=costs, gap_score=-1) == -sz.edit_distance(a, b) +assert sz.alignment_score("first", "second", substitution_matrix=costs, gap_score=-1) == -sz.levenshtein_distance(a, b) ``` Using the same proteins as for Levenshtein distance benchmarks: @@ -1088,8 +1088,8 @@ Standard library functions may not offer the most efficient or convenient method - `haystack.replace_all(sz::byteset(""), replacement_string)` - `haystack.try_replace_all(needle_string, replacement_string)` - `haystack.try_replace_all(sz::byteset(""), replacement_string)` -- `haystack.transform(sz::look_up_table::identity())` -- `haystack.transform(sz::look_up_table::identity(), haystack.data())` +- `haystack.lookup(sz::look_up_table::identity())` +- `haystack.lookup(sz::look_up_table::identity(), haystack.data())` ### Levenshtein Edit Distance and Alignment Scores @@ -1103,8 +1103,8 @@ sz::hamming_distance(first, second[, upper_bound]) -> std::size_t; sz::hamming_distance_utf8(first, second[, upper_bound]) -> std::size_t; // Count number of insertions, deletions and substitutions -sz::edit_distance(first, second[, upper_bound[, allocator]]) -> std::size_t; -sz::edit_distance_utf8(first, second[, upper_bound[, allocator]]) -> std::size_t; +sz::levenshtein_distance(first, second[, upper_bound[, allocator]]) -> std::size_t; +sz::levenshtein_distance_utf8(first, second[, upper_bound[, allocator]]) -> std::size_t; // Substitution-parametrized Needleman-Wunsch global alignment score std::int8_t costs[256][256]; // Substitution costs matrix @@ -1160,8 +1160,8 @@ The performance of those containers is often limited by the performance of the s StringZilla can be used to accelerate containers with `std::string` keys, by overriding the default comparator and hash functions. ```cpp -std::map sorted_words; -std::unordered_map words; +std::map sorted_words; +std::unordered_map words; ``` Alternatively, a better approach would be to use the `sz::string` class as a key. @@ -1278,19 +1278,19 @@ assert_eq!(my_str.sz_find("world"), Some(7)); assert_eq!(my_cow_str.as_ref().sz_find("world"), Some(7)); ``` -The library also exposes Levenshtein and Hamming edit-distances for byte-arrays and UTF-8 strings, as well as Needleman-Wunch alignment scores. +The library also exposes Levenshtein and Hamming edit-distances for byte-arrays and UTF-8 strings, as well as Needleman-Wunsch alignment scores. ```rust use stringzilla::sz; // Handling arbitrary byte arrays: -sz::edit_distance("Hello, world!", "Hello, world?"); // 1 +sz::levenshtein_distance("Hello, world!", "Hello, world?"); // 1 sz::hamming_distance("Hello, world!", "Hello, world?"); // 1 sz::alignment_score("Hello, world!", "Hello, world?", sz::unary_substitution_costs(), -1); // -1 // Handling UTF-8 strings: sz::hamming_distance_utf8("αβγδ", "αγγδ") // 1 -sz::edit_distance_utf8("façade", "facade") // 1 +sz::levenshtein_distance_utf8("façade", "facade") // 1 ``` [memchr-benchmarks]: https://github.com/ashvardanian/memchr_vs_stringzilla @@ -1465,7 +1465,7 @@ In AVX-512, StringZilla uses non-temporal stores to avoid cache pollution, when Moreover, it handles the unaligned head and the tails of the `target` buffer separately, ensuring that writes in big copies are always aligned to cache-line boundaries. That's true for both AVX2 and AVX-512 backends. 
-StringZilla also contains "drafts" of smarter, but less efficient algorithms, that minimize the number of unaligned loads, perfoming shuffles and permutations. +StringZilla also contains "drafts" of smarter, but less efficient algorithms, that minimize the number of unaligned loads, performing shuffles and permutations. That's a topic for future research, as the performance gains are not yet satisfactory. > § Reading materials. From 298d2146b8839eab71cb40f5837adc687dc1b952 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 10 Mar 2025 10:51:36 +0000 Subject: [PATCH 183/751] Fix: Extra comma in `printf` --- scripts/bench.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/bench.hpp b/scripts/bench.hpp index a18e47c5..fca1cfea 100644 --- a/scripts/bench.hpp +++ b/scripts/bench.hpp @@ -209,11 +209,11 @@ inline dataset_t make_dataset_from_path(std::string path) { mean_line_bytes /= data.lines.size(); std::setlocale(LC_NUMERIC, ""); - std::printf( // - "Parsed the dataset with:\n" // - "- %zu words of mean length ~ %.2f bytes\n" // - "- %zu lines of mean length ~ %.2f bytes\n", // - "- %zu bytes in total\n", // + std::printf( // + "Parsed the dataset with:\n" // + "- %zu words of mean length ~ %.2f bytes\n" // + "- %zu lines of mean length ~ %.2f bytes\n" // + "- %zu bytes in total\n", // data.tokens.size(), mean_token_bytes, data.lines.size(), mean_line_bytes, data.text.size()); return data; From 467b4b81cb4bc0e9a64844748a417762378918c9 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 10 Mar 2025 11:10:04 +0000 Subject: [PATCH 184/751] Make: Formatting CMakeLists.txt --- .cmake-format.py | 19 ++ CMakeLists.txt | 666 +++++++++++++++++++++++------------------------ 2 files changed, 340 insertions(+), 345 deletions(-) create mode 100644 .cmake-format.py diff --git a/.cmake-format.py b/.cmake-format.py new file mode 100644 index 00000000..fb56f11b --- /dev/null +++ b/.cmake-format.py @@ -0,0 +1,19 @@ +# ----------------------------- +# Options effecting formatting. +# ----------------------------- +with section("format"): + # How wide to allow formatted cmake files + line_width = 120 + + # How many spaces to tab for indent + tab_size = 4 + + # If true, separate flow control names from their parentheses with a space + separate_ctrl_name_with_space = True + + # If true, separate function names from parentheses with a space + separate_fn_name_with_space = False + + # If a statement is wrapped to more than one line, than dangle the closing + # parenthesis on its own line. + dangle_parens = True diff --git a/CMakeLists.txt b/CMakeLists.txt index 1da1e36f..f1c1a24f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,42 +2,43 @@ # # This file defines several library build & installation targets: # -# - stringzilla_header: A header-only library with the StringZilla C and C++ headers. -# - stringzilla_shared: A shared library with the StringZilla C and C++ headers and dynamic SIMD dispatch. -# - stringzilla_bare: A shared library with the StringZilla headers, but without linking the standard C library. -# +# * stringzilla_header: A header-only library with the StringZilla C and C++ headers. +# * stringzilla_shared: A shared library with the StringZilla C and C++ headers and dynamic SIMD dispatch. +# * stringzilla_bare: A shared library with the StringZilla headers, but without linking the standard C library. 
+# # Tests for different C++ standards: # -# - stringzilla_test_cpp11: C++11 baseline support. -# - stringzilla_test_cpp14: C++14 support with `std::less`-like function objects. -# - stringzilla_test_cpp17: C++17 support with `std::string_view` compatibility. -# - stringzilla_test_cpp20: C++20 support with `<=>` operator and more `constexpr` features. +# * stringzilla_test_cpp11: C++11 baseline support. +# * stringzilla_test_cpp14: C++14 support with `std::less`-like function objects. +# * stringzilla_test_cpp17: C++17 support with `std::string_view` compatibility. +# * stringzilla_test_cpp20: C++20 support with `<=>` operator and more `constexpr` features. # # Tests for different SIMD architectures: # -# - stringzilla_test_cpp20_serial: A test executable for serial execution. -# - stringzilla_test_cpp20_haswell: A test executable for AVX2. -# - stringzilla_test_cpp20_ice: A test executable for AVX-512. -# - stringzilla_test_cpp20_neon: A test executable for ARM Neon. -# - stringzilla_test_cpp20_sve: A test executable for ARM Scalable Vector Extension. +# * stringzilla_test_cpp20_serial: A test executable for serial execution. +# * stringzilla_test_cpp20_haswell: A test executable for AVX2. +# * stringzilla_test_cpp20_ice: A test executable for AVX-512. +# * stringzilla_test_cpp20_neon: A test executable for ARM Neon. +# * stringzilla_test_cpp20_sve: A test executable for ARM Scalable Vector Extension. # # Benchmarks: # -# - stringzilla_bench_search: A benchmark for substring search operations. -# - stringzilla_bench_similarity: A benchmark for similarity operations. -# - stringzilla_bench_sort: A benchmark for sorting operations. -# - stringzilla_bench_token: A benchmark for comparators and hash functions. -# - stringzilla_bench_container: A benchmark for STL containers powered by StringZilla. -# - stringzilla_bench_memory: A benchmark for LibC-style low-level memory operations. +# * stringzilla_bench_search: A benchmark for substring search operations. +# * stringzilla_bench_similarity: A benchmark for similarity operations. +# * stringzilla_bench_sort: A benchmark for sorting operations. +# * stringzilla_bench_token: A benchmark for comparators and hash functions. +# * stringzilla_bench_container: A benchmark for STL containers powered by StringZilla. +# * stringzilla_bench_memory: A benchmark for LibC-style low-level memory operations. # # For higher-level language bindings separate build scripts are provided, native to each toolchain. 
cmake_minimum_required(VERSION 3.14 FATAL_ERROR) project( - stringzilla - VERSION 3.11.3 - LANGUAGES C CXX - DESCRIPTION "SIMD-accelerated string search, sort, hashes, fingerprints, & edit distances" - HOMEPAGE_URL "https://github.com/ashvardanian/stringzilla") + stringzilla + VERSION 3.11.3 + LANGUAGES C CXX + DESCRIPTION "SIMD-accelerated string search, sort, hashes, fingerprints, & edit distances" + HOMEPAGE_URL "https://github.com/ashvardanian/stringzilla" +) set(CMAKE_C_STANDARD 99) set(CMAKE_CXX_STANDARD 11) @@ -55,363 +56,338 @@ message(STATUS "C++ Compiler Version: ${CMAKE_CXX_COMPILER_VERSION}") message(STATUS "C++ Compiler: ${CMAKE_CXX_COMPILER}") message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") -if(CMAKE_SIZEOF_VOID_P EQUAL 8) - message(STATUS "Pointer size: 64-bit") -else() - message(STATUS "Pointer size: 32-bit") -endif() +if (CMAKE_SIZEOF_VOID_P EQUAL 8) + message(STATUS "Pointer size: 64-bit") +else () + message(STATUS "Pointer size: 32-bit") +endif () # Set a default build type to "Release" if none was specified -if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - message(STATUS "Setting build type to 'Release' as none was specified.") - set(CMAKE_BUILD_TYPE - Release - CACHE STRING "Choose the type of build." FORCE) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" - "MinSizeRel" "RelWithDebInfo") -endif() - -if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64") - SET(SZ_PLATFORM_X86 TRUE) - message(STATUS "Platform: x86") -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64|arm64|ARM64") - SET(SZ_PLATFORM_ARM TRUE) - message(STATUS "Platform: ARM") -endif() - -# Determine if StringZilla is built as a sub-project (using `add_subdirectory`) -# or if it is the main project +if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + message(STATUS "Setting build type to 'Release' as none was specified.") + set(CMAKE_BUILD_TYPE + Release + CACHE STRING "Choose the type of build." 
FORCE + ) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +endif () + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64") + set(SZ_PLATFORM_X86 TRUE) + message(STATUS "Platform: x86") +elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64|arm64|ARM64") + set(SZ_PLATFORM_ARM TRUE) + message(STATUS "Platform: ARM") +endif () + +# Determine if StringZilla is built as a sub-project (using `add_subdirectory`) or if it is the main project set(STRINGZILLA_IS_MAIN_PROJECT OFF) -if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) - set(STRINGZILLA_IS_MAIN_PROJECT ON) -endif() +if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) + set(STRINGZILLA_IS_MAIN_PROJECT ON) +endif () # Installation options option(STRINGZILLA_INSTALL "Install CMake targets" OFF) -option(STRINGZILLA_BUILD_TEST "Compile a native unit test in C++" - ${STRINGZILLA_IS_MAIN_PROJECT}) -option(STRINGZILLA_BUILD_BENCHMARK "Compile a native benchmark in C++" - ${STRINGZILLA_IS_MAIN_PROJECT}) +option(STRINGZILLA_BUILD_TEST "Compile a native unit test in C++" ${STRINGZILLA_IS_MAIN_PROJECT}) +option(STRINGZILLA_BUILD_BENCHMARK "Compile a native benchmark in C++" ${STRINGZILLA_IS_MAIN_PROJECT}) option(STRINGZILLA_BUILD_SHARED "Compile a dynamic library" ${STRINGZILLA_IS_MAIN_PROJECT}) set(STRINGZILLA_TARGET_ARCH - "" - CACHE STRING "Architecture to tell the compiler to optimize for (-march)") + "" + CACHE STRING "Architecture to tell the compiler to optimize for (-march)" +) # Includes set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) include(ExternalProject) include(CheckCSourceCompiles) -# Allow CMake 3.13+ to override options when using FetchContent / -# add_subdirectory -if(POLICY CMP0077) - cmake_policy(SET CMP0077 NEW) -endif() +# Allow CMake 3.13+ to override options when using FetchContent / add_subdirectory +if (POLICY CMP0077) + cmake_policy(SET CMP0077 NEW) +endif () # Configuration include(GNUInstallDirs) set(STRINGZILLA_INCLUDE_BUILD_DIR "${PROJECT_SOURCE_DIR}/include/") set(STRINGZILLA_INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}") - -if(${CMAKE_VERSION} VERSION_EQUAL 3.13 OR ${CMAKE_VERSION} VERSION_GREATER 3.13) - include(CTest) - enable_testing() -endif() +if (${CMAKE_VERSION} VERSION_EQUAL 3.13 OR ${CMAKE_VERSION} VERSION_GREATER 3.13) + include(CTest) + enable_testing() +endif () if (MSVC) - # Remove /RTC* from MSVC debug flags by default (it will be added back in the set_compiler_flags function) - # Because /RTC* cannot be used without the crt so it needs to be disabled for that specific target - string(REGEX REPLACE "/RTC[^ ]*" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") - string(REGEX REPLACE "/RTC[^ ]*" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") -endif() + # Remove /RTC* from MSVC debug flags by default (it will be added back in the set_compiler_flags function) Because + # /RTC* cannot be used without the crt so it needs to be disabled for that specific target + string(REGEX REPLACE "/RTC[^ ]*" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") + string(REGEX REPLACE "/RTC[^ ]*" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") +endif () # Function to set compiler-specific flags -function(set_compiler_flags target cpp_standard target_arch) - get_target_property(target_type ${target} TYPE) +function (set_compiler_flags target cpp_standard target_arch) + get_target_property(target_type ${target} TYPE) - target_include_directories(${target} PRIVATE scripts) + target_include_directories(${target} 
PRIVATE scripts) - # Set output directory for single-configuration generators (like Make) - set_target_properties(${target} PROPERTIES - RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/$<0:> - ) + # Set output directory for single-configuration generators (like Make) + set_target_properties(${target} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/$<0:>) - # Set output directory for multi-configuration generators (like Visual Studio) - foreach(config IN LISTS CMAKE_CONFIGURATION_TYPES) - string(TOUPPER ${config} config_upper) - set_target_properties(${target} PROPERTIES - RUNTIME_OUTPUT_DIRECTORY_${config_upper} ${CMAKE_BINARY_DIR}/$<0:> - ) - endforeach() - - # Set the C++ standard - if(NOT ${cpp_standard} STREQUAL "") - set_target_properties(${target} PROPERTIES CXX_STANDARD ${cpp_standard}) - endif() - - # Use the /Zc:__cplusplus flag to correctly define the __cplusplus macro in MSVC - target_compile_options(${target} PRIVATE "$<$:/Zc:__cplusplus>") - - # Maximum warnings level & warnings as error. - # MVC uses numeric values: - # > 4068 for "unknown pragmas". - # > 4146 for "unary minus operator applied to unsigned type, result still unsigned". - # We also specify /utf-8 to properly UTF-8 symbols in tests. - target_compile_options( - ${target} - PRIVATE - "$<$:/Bt;/wd4068;/wd4146;/utf-8;/WX>" - "$<$:-Wall;-Wextra;-pedantic;-Werror;-Wfatal-errors;-Wno-unknown-pragmas;-Wno-cast-function-type;-Wno-unused-function>" - "$<$:-Wall;-Wextra;-pedantic;-Werror;-Wfatal-errors;-Wno-unknown-pragmas>" - "$<$:-Wall;-Wextra;-pedantic;-Werror;-Wfatal-errors;-Wno-unknown-pragmas>" - ) - - # Set optimization options for different compilers differently - target_compile_options( - ${target} - PRIVATE - "$<$,$,$>>:-O3>" - "$<$,$,$>>:-g>" - "$<$,$,$>>:-O3>" - "$<$,$,$>>:-g>" - "$<$,$>:/O2>" - "$<$,$,$>>:/O2>" - "$<$,$,$>>:/Zi>" - ) - - if(NOT target_type STREQUAL "SHARED_LIBRARY") - if(MSVC) - target_compile_options(${target} PRIVATE "$<$:/RTC1>") - endif() - endif() - - # If available, enable Position Independent Code - get_target_property(target_pic ${target} POSITION_INDEPENDENT_CODE) - if(target_pic) - target_compile_options(${target} PRIVATE "$<$:-fPIC>") - target_link_options(${target} PRIVATE "$<$:-fPIC>") - target_compile_definitions(${target} PRIVATE "$<$:SZ_PIC>") - endif() - - # Avoid builtin functions where we know what we are doing. 
- target_compile_options(${target} PRIVATE "$<$:-fno-builtin-memcmp>") - target_compile_options(${target} PRIVATE "$<$:-fno-builtin-memchr>") - target_compile_options(${target} PRIVATE "$<$:-fno-builtin-memcpy>") - target_compile_options(${target} PRIVATE "$<$:-fno-builtin-memset>") - target_compile_options(${target} PRIVATE "$<$:/Oi->") - - # Check for ${target_arch} and set it or use the current system if not defined - if("${target_arch}" STREQUAL "") - # Only use the current system if we are not cross compiling - if((NOT CMAKE_CROSSCOMPILING) OR (CMAKE_SYSTEM_PROCESSOR MATCHES CMAKE_HOST_SYSTEM_PROCESSOR)) - if (NOT MSVC) - include(CheckCXXCompilerFlag) - check_cxx_compiler_flag("-march=native" supports_march_native) - if (supports_march_native) - target_compile_options(${target} PRIVATE "-march=native") - endif() - else() - # MSVC does not have a direct equivalent to -march=native - target_compile_options(${target} PRIVATE "/arch:AVX2") - endif() - endif() - else() + # Set output directory for multi-configuration generators (like Visual Studio) + foreach (config IN LISTS CMAKE_CONFIGURATION_TYPES) + string(TOUPPER ${config} config_upper) + set_target_properties(${target} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${config_upper} ${CMAKE_BINARY_DIR}/$<0:>) + endforeach () + + # Set the C++ standard + if (NOT ${cpp_standard} STREQUAL "") + set_target_properties(${target} PROPERTIES CXX_STANDARD ${cpp_standard}) + endif () + + # Use the /Zc:__cplusplus flag to correctly define the __cplusplus macro in MSVC + target_compile_options(${target} PRIVATE "$<$:/Zc:__cplusplus>") + + # Maximum warnings level & warnings as error. MVC uses numeric values: > 4068 for "unknown pragmas". > 4146 for + # "unary minus operator applied to unsigned type, result still unsigned". We also specify /utf-8 to properly UTF-8 + # symbols in tests. 
target_compile_options( - ${target} - PRIVATE - "$<$:-march=${target_arch}>" - "$<$:/arch:${target_arch}>") - endif() - - # Define SZ_DETECT_BIG_ENDIAN macro based on system byte order - if(CMAKE_C_BYTE_ORDER STREQUAL "BIG_ENDIAN") - set(SZ_DETECT_BIG_ENDIAN 1) - else() - set(SZ_DETECT_BIG_ENDIAN 0) - endif() - - target_compile_definitions( - ${target} - PRIVATE - "SZ_DETECT_BIG_ENDIAN=${SZ_DETECT_BIG_ENDIAN}" - ) - - # Sanitizer options for Debug mode - if(CMAKE_BUILD_TYPE STREQUAL "Debug") - if(NOT target_type STREQUAL "SHARED_LIBRARY") - target_compile_options( ${target} PRIVATE - "$<$:-fsanitize=address;-fsanitize=leak>" - "$<$:/fsanitize=address>") + "$<$:/Bt;/wd4068;/wd4146;/utf-8;/WX>" + "$<$:-Wall;-Wextra;-pedantic;-Werror;-Wfatal-errors;-Wno-unknown-pragmas;-Wno-cast-function-type;-Wno-unused-function>" + "$<$:-Wall;-Wextra;-pedantic;-Werror;-Wfatal-errors;-Wno-unknown-pragmas>" + "$<$:-Wall;-Wextra;-pedantic;-Werror;-Wfatal-errors;-Wno-unknown-pragmas>" + ) - target_link_options( + # Set optimization options for different compilers differently + target_compile_options( ${target} - PRIVATE - "$<$:-fsanitize=address;-fsanitize=leak>" - "$<$:/fsanitize=address>") - endif() - - # Define SZ_DEBUG macro based on build configuration - target_compile_definitions( - ${target} - PRIVATE - "$<$:SZ_DEBUG=1>" - "$<$>:SZ_DEBUG=0>" + PRIVATE "$<$,$,$>>:-O3>" + "$<$,$,$>>:-g>" + "$<$,$,$>>:-O3>" + "$<$,$,$>>:-g>" + "$<$,$>:/O2>" + "$<$,$,$>>:/O2>" + "$<$,$,$>>:/Zi>" ) - endif() -endfunction() - -function(define_launcher exec_name source cpp_standard target_arch) - add_executable(${exec_name} ${source}) - set_compiler_flags(${exec_name} ${cpp_standard} "${target_arch}") - target_link_libraries(${exec_name} PRIVATE stringzilla_header) - add_test(NAME ${exec_name} COMMAND ${exec_name}) -endfunction() - -if(${STRINGZILLA_BUILD_BENCHMARK}) - define_launcher(stringzilla_bench_search scripts/bench_search.cpp 17 "${STRINGZILLA_TARGET_ARCH}") - define_launcher(stringzilla_bench_similarity scripts/bench_similarity.cpp 17 "${STRINGZILLA_TARGET_ARCH}") - define_launcher(stringzilla_bench_sort scripts/bench_sort.cpp 17 "${STRINGZILLA_TARGET_ARCH}") - define_launcher(stringzilla_bench_token scripts/bench_token.cpp 17 "${STRINGZILLA_TARGET_ARCH}") - define_launcher(stringzilla_bench_container scripts/bench_container.cpp 17 "${STRINGZILLA_TARGET_ARCH}") - define_launcher(stringzilla_bench_memory scripts/bench_memory.cpp 17 "${STRINGZILLA_TARGET_ARCH}") -endif() - -if(${STRINGZILLA_BUILD_TEST}) - # Make sure that the compilation passes for different C++ standards - # ! Keep in mind, MSVC only supports C++11 and newer. - define_launcher(stringzilla_test_cpp11 scripts/test.cpp 11 "${STRINGZILLA_TARGET_ARCH}") - define_launcher(stringzilla_test_cpp14 scripts/test.cpp 14 "${STRINGZILLA_TARGET_ARCH}") - define_launcher(stringzilla_test_cpp17 scripts/test.cpp 17 "${STRINGZILLA_TARGET_ARCH}") - define_launcher(stringzilla_test_cpp20 scripts/test.cpp 20 "${STRINGZILLA_TARGET_ARCH}") - - # Check system architecture to avoid complex cross-compilation workflows, but - # compile multiple backends: disabling all SIMD, enabling only AVX2, only AVX-512, only Arm Neon. 
- if(SZ_PLATFORM_X86) - # x86 specific backends - if (MSVC) - define_launcher(stringzilla_test_cpp20_serial scripts/test.cpp 20 "AVX") - define_launcher(stringzilla_test_cpp20_haswell scripts/test.cpp 20 "AVX2") - define_launcher(stringzilla_test_cpp20_ice scripts/test.cpp 20 "AVX512") - else() - define_launcher(stringzilla_test_cpp20_serial scripts/test.cpp 20 "ivybridge") - define_launcher(stringzilla_test_cpp20_haswell scripts/test.cpp 20 "haswell") - define_launcher(stringzilla_test_cpp20_ice scripts/test.cpp 20 "sapphirerapids") - endif() - elseif(SZ_PLATFORM_ARM) - # ARM specific backends - define_launcher(stringzilla_test_cpp20_serial scripts/test.cpp 20 "armv8-a") - define_launcher(stringzilla_test_cpp20_neon scripts/test.cpp 20 "armv8-a+simd") - define_launcher(stringzilla_test_cpp20_sve scripts/test.cpp 20 "armv8.2-a+sve") - endif() -endif() + + if (NOT target_type STREQUAL "SHARED_LIBRARY") + if (MSVC) + target_compile_options(${target} PRIVATE "$<$:/RTC1>") + endif () + endif () + + # If available, enable Position Independent Code + get_target_property(target_pic ${target} POSITION_INDEPENDENT_CODE) + if (target_pic) + target_compile_options(${target} PRIVATE "$<$:-fPIC>") + target_link_options(${target} PRIVATE "$<$:-fPIC>") + target_compile_definitions(${target} PRIVATE "$<$:SZ_PIC>") + endif () + + # Avoid builtin functions where we know what we are doing. + target_compile_options(${target} PRIVATE "$<$:-fno-builtin-memcmp>") + target_compile_options(${target} PRIVATE "$<$:-fno-builtin-memchr>") + target_compile_options(${target} PRIVATE "$<$:-fno-builtin-memcpy>") + target_compile_options(${target} PRIVATE "$<$:-fno-builtin-memset>") + target_compile_options(${target} PRIVATE "$<$:/Oi->") + + # Check for ${target_arch} and set it or use the current system if not defined + if ("${target_arch}" STREQUAL "") + # Only use the current system if we are not cross compiling + if ((NOT CMAKE_CROSSCOMPILING) OR (CMAKE_SYSTEM_PROCESSOR MATCHES CMAKE_HOST_SYSTEM_PROCESSOR)) + if (NOT MSVC) + include(CheckCXXCompilerFlag) + check_cxx_compiler_flag("-march=native" supports_march_native) + if (supports_march_native) + target_compile_options(${target} PRIVATE "-march=native") + endif () + else () + # MSVC does not have a direct equivalent to -march=native + target_compile_options(${target} PRIVATE "/arch:AVX2") + endif () + endif () + else () + target_compile_options( + ${target} PRIVATE "$<$:-march=${target_arch}>" + "$<$:/arch:${target_arch}>" + ) + endif () + + # Define SZ_DETECT_BIG_ENDIAN macro based on system byte order + if (CMAKE_C_BYTE_ORDER STREQUAL "BIG_ENDIAN") + set(SZ_DETECT_BIG_ENDIAN 1) + else () + set(SZ_DETECT_BIG_ENDIAN 0) + endif () + + target_compile_definitions(${target} PRIVATE "SZ_DETECT_BIG_ENDIAN=${SZ_DETECT_BIG_ENDIAN}") + + # Sanitizer options for Debug mode + if (CMAKE_BUILD_TYPE STREQUAL "Debug") + if (NOT target_type STREQUAL "SHARED_LIBRARY") + target_compile_options( + ${target} PRIVATE "$<$:-fsanitize=address;-fsanitize=leak>" + "$<$:/fsanitize=address>" + ) + + target_link_options( + ${target} PRIVATE "$<$:-fsanitize=address;-fsanitize=leak>" + "$<$:/fsanitize=address>" + ) + endif () + + # Define SZ_DEBUG macro based on build configuration + target_compile_definitions( + ${target} PRIVATE "$<$:SZ_DEBUG=1>" "$<$>:SZ_DEBUG=0>" + ) + endif () +endfunction () + +function (define_launcher exec_name source cpp_standard target_arch) + add_executable(${exec_name} ${source}) + set_compiler_flags(${exec_name} ${cpp_standard} "${target_arch}") + 
target_link_libraries(${exec_name} PRIVATE stringzilla_header) + add_test(NAME ${exec_name} COMMAND ${exec_name}) +endfunction () + +if (${STRINGZILLA_BUILD_BENCHMARK}) + define_launcher(stringzilla_bench_search scripts/bench_search.cpp 17 "${STRINGZILLA_TARGET_ARCH}") + define_launcher(stringzilla_bench_similarity scripts/bench_similarity.cpp 17 "${STRINGZILLA_TARGET_ARCH}") + define_launcher(stringzilla_bench_sort scripts/bench_sort.cpp 17 "${STRINGZILLA_TARGET_ARCH}") + define_launcher(stringzilla_bench_token scripts/bench_token.cpp 17 "${STRINGZILLA_TARGET_ARCH}") + define_launcher(stringzilla_bench_container scripts/bench_container.cpp 17 "${STRINGZILLA_TARGET_ARCH}") + define_launcher(stringzilla_bench_memory scripts/bench_memory.cpp 17 "${STRINGZILLA_TARGET_ARCH}") +endif () + +if (${STRINGZILLA_BUILD_TEST}) + # Make sure that the compilation passes for different C++ standards ! Keep in mind, MSVC only supports C++11 and + # newer. + define_launcher(stringzilla_test_cpp11 scripts/test.cpp 11 "${STRINGZILLA_TARGET_ARCH}") + define_launcher(stringzilla_test_cpp14 scripts/test.cpp 14 "${STRINGZILLA_TARGET_ARCH}") + define_launcher(stringzilla_test_cpp17 scripts/test.cpp 17 "${STRINGZILLA_TARGET_ARCH}") + define_launcher(stringzilla_test_cpp20 scripts/test.cpp 20 "${STRINGZILLA_TARGET_ARCH}") + + # Check system architecture to avoid complex cross-compilation workflows, but compile multiple backends: disabling + # all SIMD, enabling only AVX2, only AVX-512, only Arm Neon. + if (SZ_PLATFORM_X86) + # x86 specific backends + if (MSVC) + define_launcher(stringzilla_test_cpp20_serial scripts/test.cpp 20 "AVX") + define_launcher(stringzilla_test_cpp20_haswell scripts/test.cpp 20 "AVX2") + define_launcher(stringzilla_test_cpp20_ice scripts/test.cpp 20 "AVX512") + else () + define_launcher(stringzilla_test_cpp20_serial scripts/test.cpp 20 "ivybridge") + define_launcher(stringzilla_test_cpp20_haswell scripts/test.cpp 20 "haswell") + define_launcher(stringzilla_test_cpp20_ice scripts/test.cpp 20 "sapphirerapids") + endif () + elseif (SZ_PLATFORM_ARM) + # ARM specific backends + define_launcher(stringzilla_test_cpp20_serial scripts/test.cpp 20 "armv8-a") + define_launcher(stringzilla_test_cpp20_neon scripts/test.cpp 20 "armv8-a+simd") + define_launcher(stringzilla_test_cpp20_sve scripts/test.cpp 20 "armv8.2-a+sve") + endif () +endif () # Define our libraries, first the header-only version add_library(stringzilla_header INTERFACE) add_library(${PROJECT_NAME}::stringzilla_header ALIAS stringzilla_header) target_include_directories( - stringzilla_header - INTERFACE $ - $) - - -if(${STRINGZILLA_BUILD_SHARED}) - - function(define_shared target) - add_library(${target} SHARED c/lib.c) - add_library(${PROJECT_NAME}::${target} ALIAS ${target}) - - set_target_properties(${target} PROPERTIES - VERSION ${PROJECT_VERSION} - SOVERSION 1 - POSITION_INDEPENDENT_CODE ON) - - if (SZ_PLATFORM_X86) - if (MSVC) - set_compiler_flags(${target} "" "SSE2") - else() - set_compiler_flags(${target} "" "ivybridge") - endif() - - target_compile_definitions(${target} PRIVATE - "SZ_USE_HASWELL=1" - "SZ_USE_SKYLAKE=1" - "SZ_USE_ICE=1" - "SZ_USE_NEON=0" - "SZ_USE_SVE=0") - elseif(SZ_PLATFORM_ARM) - set_compiler_flags(${target} "" "armv8-a") - - target_compile_definitions(${target} PRIVATE - "SZ_USE_HASWELL=0" - "SZ_USE_SKYLAKE=0" - "SZ_USE_ICE=0" - "SZ_USE_NEON=1" - "SZ_USE_SVE=1") - endif() - - if (MSVC) - # Add dependencies for necessary runtime libraries in case of static linking - # This ensures that basic runtime functions 
are available: - # msvcrt.lib: Microsoft Visual C Runtime, required for basic C runtime functions on Windows. - # vcruntime.lib: Microsoft Visual C++ Runtime library for basic runtime functions. - # ucrt.lib: Universal C Runtime, necessary for linking basic C functions like I/O. - target_link_libraries(${target} PRIVATE msvcrt.lib vcruntime.lib ucrt.lib) - endif() - - endfunction() - - define_shared(stringzilla_shared) - target_compile_definitions(stringzilla_shared PRIVATE "SZ_AVOID_LIBC=0") - target_compile_definitions(stringzilla_shared PRIVATE "SZ_OVERRIDE_LIBC=1") - target_include_directories(stringzilla_shared PUBLIC include) - - - # Try compiling a version without linking the LibC - # ! This is only for Linux and Windows, as on modern Arm-based MacOS machines - # ! we can't legally access Arm's "feature registers" without `sysctl` or `sysctlbyname`. - # So let's check if we are compiling for a Darwin-based OS. - if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin") - define_shared(stringzilla_bare) - target_compile_definitions(stringzilla_bare PRIVATE "SZ_AVOID_LIBC=1") - target_compile_definitions(stringzilla_bare PRIVATE "SZ_OVERRIDE_LIBC=1") - target_include_directories(stringzilla_bare PUBLIC include) - - # Avoid built-ins on MSVC and other compilers, as that will cause compilation errors - target_compile_options(stringzilla_bare PRIVATE - "$<$:-fno-builtin;-nostdlib>" - "$<$:/Oi-;/GS->") - target_link_options(stringzilla_bare PRIVATE "$<$:-nostdlib>") - target_link_options(stringzilla_bare PRIVATE "$<$:/NODEFAULTLIB>") - endif() -endif() - -if(STRINGZILLA_INSTALL) - install( - TARGETS stringzilla_shared - ARCHIVE - BUNDLE - FRAMEWORK - LIBRARY - OBJECTS - PRIVATE_HEADER - PUBLIC_HEADER - RESOURCE - RUNTIME) - install( - TARGETS stringzilla_bare - ARCHIVE - BUNDLE - FRAMEWORK - LIBRARY - OBJECTS - PRIVATE_HEADER - PUBLIC_HEADER - RESOURCE - RUNTIME) - install(DIRECTORY ${STRINGZILLA_INCLUDE_BUILD_DIR} DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR}) - install(DIRECTORY ./c/ DESTINATION /usr/src/${PROJECT_NAME}/) -endif() + stringzilla_header INTERFACE $ $ +) + +if (${STRINGZILLA_BUILD_SHARED}) + + function (define_shared target) + add_library(${target} SHARED c/lib.c) + add_library(${PROJECT_NAME}::${target} ALIAS ${target}) + + set_target_properties( + ${target} + PROPERTIES VERSION ${PROJECT_VERSION} + SOVERSION 1 + POSITION_INDEPENDENT_CODE ON + ) + + if (SZ_PLATFORM_X86) + if (MSVC) + set_compiler_flags(${target} "" "SSE2") + else () + set_compiler_flags(${target} "" "ivybridge") + endif () + + target_compile_definitions( + ${target} PRIVATE "SZ_USE_HASWELL=1" "SZ_USE_SKYLAKE=1" "SZ_USE_ICE=1" "SZ_USE_NEON=0" "SZ_USE_SVE=0" + ) + elseif (SZ_PLATFORM_ARM) + set_compiler_flags(${target} "" "armv8-a") + + target_compile_definitions( + ${target} PRIVATE "SZ_USE_HASWELL=0" "SZ_USE_SKYLAKE=0" "SZ_USE_ICE=0" "SZ_USE_NEON=1" "SZ_USE_SVE=1" + ) + endif () + + if (MSVC) + # Add dependencies for necessary runtime libraries in case of static linking. This ensures that basic + # runtime functions are available: + # + # * msvcrt.lib: Microsoft Visual C Runtime, required for basic C runtime functions on Windows. + # * vcruntime.lib: Microsoft Visual C++ Runtime library for basic runtime functions. + # * ucrt.lib: Universal C Runtime, necessary for linking basic C functions like I/O. 
+ target_link_libraries(${target} PRIVATE msvcrt.lib vcruntime.lib ucrt.lib) + endif () + + endfunction () + + define_shared(stringzilla_shared) + target_compile_definitions(stringzilla_shared PRIVATE "SZ_AVOID_LIBC=0") + target_compile_definitions(stringzilla_shared PRIVATE "SZ_OVERRIDE_LIBC=1") + target_include_directories(stringzilla_shared PUBLIC include) + + # Try compiling a version without linking the LibC ! This is only for Linux and Windows, as on modern Arm-based + # MacOS machines ! we can't legally access Arm's "feature registers" without `sysctl` or `sysctlbyname`. So let's + # check if we are compiling for a Darwin-based OS. + if (NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + define_shared(stringzilla_bare) + target_compile_definitions(stringzilla_bare PRIVATE "SZ_AVOID_LIBC=1") + target_compile_definitions(stringzilla_bare PRIVATE "SZ_OVERRIDE_LIBC=1") + target_include_directories(stringzilla_bare PUBLIC include) + + # Avoid built-ins on MSVC and other compilers, as that will cause compilation errors + target_compile_options( + stringzilla_bare PRIVATE "$<$:-fno-builtin;-nostdlib>" + "$<$:/Oi-;/GS->" + ) + target_link_options(stringzilla_bare PRIVATE "$<$:-nostdlib>") + target_link_options(stringzilla_bare PRIVATE "$<$:/NODEFAULTLIB>") + endif () +endif () + +if (STRINGZILLA_INSTALL) + install( + TARGETS stringzilla_shared + ARCHIVE + BUNDLE + FRAMEWORK + LIBRARY + OBJECTS + PRIVATE_HEADER + PUBLIC_HEADER + RESOURCE + RUNTIME + ) + install( + TARGETS stringzilla_bare + ARCHIVE + BUNDLE + FRAMEWORK + LIBRARY + OBJECTS + PRIVATE_HEADER + PUBLIC_HEADER + RESOURCE + RUNTIME + ) + install(DIRECTORY ${STRINGZILLA_INCLUDE_BUILD_DIR} DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR}) + install(DIRECTORY ./c/ DESTINATION /usr/src/${PROJECT_NAME}/) +endif () From 366816ed5c9811de42b5affdfab41d329ede06cd Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 10 Mar 2025 11:10:30 +0000 Subject: [PATCH 185/751] Docs: Ignore formatting CMake --- .git-blame-ignore-revs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index c583f5fb..0f60e2c9 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -39,4 +39,4 @@ bd547453122e9f8565e5be15f137e7b0de37caca 22e3d1e34d62d68c1e89df7c8bdc201faa18a9de ecb377541d0c706cf8997faff4f026b07e3f76f3 0d982a45f842287d7e344f0d8b360f52482017f5 - +467b4b81cb4bc0e9a64844748a417762378918c9 From 47444066d8817831d961095c278b20a1a01d9678 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 11 Mar 2025 21:54:26 +0000 Subject: [PATCH 186/751] Add: All new benchmarking suite The initial version only reimplements the substring and byteset search benchmarks. 
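
The helpers are configured through environment variables (`STRINGWARS_DATASET`,
`STRINGWARS_TOKENS`, `STRINGWARS_DURATION`, `STRINGWARS_FILTER`, and the
`STRINGWARS_STRESS*` family) parsed by `build_environment`. A minimal sketch of
the intended call-site, where `my_profiled_call` stands in for any benchmarked
kernel and the dataset path is only an example:

    environment_t env = build_environment(argc, argv, "leipzig1M.txt", environment_t::words_k);
    for (auto running_seconds : repeat_up_to(env.benchmark_seconds))
        do_not_optimize(my_profiled_call(env.tokens[0]));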
--- .vscode/launch.json | 4 + scripts/bench.hpp | 734 +++++++++++++++++++++++------------ scripts/bench_search.cpp | 820 +++++++++++++++++++++++++-------------- 3 files changed, 999 insertions(+), 559 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 34ec245d..70e06dc5 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -50,6 +50,10 @@ { "name": "ASAN_OPTIONS", "value": "detect_leaks=0:atexit=1:strict_init_order=1:strict_string_checks=1" + }, + { + "name": "STRINGWARS_DATASET", + "value": "leipzig1M.txt" } ], "stopAtEntry": false, diff --git a/scripts/bench.hpp b/scripts/bench.hpp index fca1cfea..1d390238 100644 --- a/scripts/bench.hpp +++ b/scripts/bench.hpp @@ -1,16 +1,45 @@ /** + * @file bench.hpp * @brief Helper structures and functions for C++ benchmarks. + * + * The StringZilla benchmarking suite doesn't use any external frameworks like Criterion or Google Benchmark. + * There are several reasons for that: + * + * 1. Reduce the number of @b dependencies and the complexity of the build system. + * + * 2. Combine @b "stress-testing" with benchmarks to deduplicate logic. + * As we work with often large datasets, with complex preprocessing, and many different backends, + * we want to minimize the surface area we debug and maintain, keeping track of string-specific + * properties, like: + * + * - Is the string start aligned in memory? + * - Does it take more than one cache line? Is it's length a multiple of the SIMD vector size? + * - Is the string cached in the L1 or L2 cache? Can the dataset fit in L3? + * + * As part of that stress-testing, on failure, those properties will be persisted in a file on disk. + * + * 3. Use cheaper profiling methods like @b CPU-counter instructions, as opposed to wall-clock time. + * Assuming we can clearly isolate single-threaded workloads and are more interested in the number + * of retired instructions, CPU counters can be more accurate and less noisy. + * + * 4. Integrate with Linux @b `perf` and other tools for more detailed analysis. + * We can isolate the relevant pieces of code, excluding the preprocessing costs from the actual workload. + * + * 5. Visualize the results differently, with a compact output for both generic workloads and special cases. */ #pragma once #include #include // `std::chrono::high_resolution_clock` #include // `std::setlocale` #include // `std::memcpy` +#include // `std::invalid_argument` #include // `std::equal_to` #include // `std::numeric_limits` #include // `std::random_device`, `std::mt19937` #include // `std::hash` -#include +#include // `std::vector` +#include // `std::regex`, `std::regex_search` +#include // `std::this_thread::sleep_for` #include // Requires C++17 @@ -20,125 +49,127 @@ #include "test.hpp" // `read_file` namespace sz = ashvardanian::stringzilla; +namespace stdc = std::chrono; namespace ashvardanian { namespace stringzilla { namespace scripts { -using seconds_t = double; +using accurate_clock_t = stdc::high_resolution_clock; template std::size_t round_up_to_multiple(std::size_t n) { return n == 0 ? multiple : ((n + multiple - 1) / multiple) * multiple; } -struct benchmark_result_t { - std::size_t iterations = 0; +struct call_result_t { + /** @brief Number of input bytes processed. */ std::size_t bytes_passed = 0; - seconds_t seconds = 0; + /** @brief Some value used to compare execution result between the baseline and accelerated backend. 
*/ + std::size_t check_value = 0; + /** @brief For some operations with non-linear complexity, the throughput should be measured differently. */ + std::size_t operations = 0; + + call_result_t() = default; + call_result_t(std::size_t bytes_passed, std::size_t check_value = 0, std::size_t operations = 0) + : bytes_passed(bytes_passed), check_value(check_value), operations(operations) {} +}; + +struct callable_no_op_t { + call_result_t operator()(std::size_t) const { return {}; } }; -using unary_function_t = std::function; -using binary_function_t = std::function; +using profiled_function_t = std::function; /** - * @brief Wraps a binary function to compare all combinations of two tokens. - * Designed to benchmark functions that on-average take very different times to execute - * for the same string or different strings. For equality checks it's similar to a typical - * load when probing a Hash Table. For relative ordering, it's similar to sorting a dense - * array with many similar strings. + * @brief Cross-platform function to get the number of CPU cycles elapsed @b only on the current core. + * Used as a more efficient alternative to `std::chrono::high_resolution_clock`. */ +inline std::uint64_t cpu_cycle_counter() { +#if defined(__i386__) || defined(__x86_64__) + // Use x86 inline assembly for `rdtsc` only if actually compiling for x86. + unsigned int lo, hi; + __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi)); + return (static_cast(hi) << 32) | lo; +#elif defined(__aarch64__) || defined(_SZ_IS_ARM64) + // On ARM64, read the virtual count register (CNTVCT_EL0) which provides cycle count. + std::uint64_t cnt; + asm volatile("mrs %0, cntvct_el0" : "=r"(cnt)); + return cnt; +#else + return 0; +#endif +} + +/** @brief Measures the approximate number of CPU cycles per second. */ +inline std::uint64_t cpu_cycles_per_second() { + std::uint64_t start = cpu_cycle_counter(); + std::this_thread::sleep_for(stdc::seconds(1)); + std::uint64_t end = cpu_cycle_counter(); + return end - start; +} + +/** @brief Measures the duration of a single call to the given function. */ template -binary_function_t binary_combinations(function_type_ function) { - return binary_function_t([function](std::string_view a, std::string_view b) { - // Assuming most outputs here will be 0 or 1, we want to combine them to with different - // multiples to ensure a unique output for each combination. - return function(a, b) * 1 + function(a, a) * 2 + function(b, a) * 4 + function(b, b) * 8; - }); +double seconds_per_call(function_type_ &&function) { + accurate_clock_t::time_point start = accurate_clock_t::now(); + function(); + accurate_clock_t::time_point end = accurate_clock_t::now(); + return stdc::duration_cast(end - start).count() / 1.e9; } /** - * @brief Wrapper for a single execution backend. + * @brief Allows time-limited for-loop iteration, similar to Google Benchmark's `for (auto _ : state)`. + * Use as `for (auto running_seconds : repeat_up_to(5.0)) { ... }`. 
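+ *
+ * A slightly fuller sketch, assuming `profiled_call` is any callable provided by the benchmark
+ * (it is not defined in this header):
+ *
+ * @code{.cpp}
+ * repeat_up_to repeat(5.0);
+ * for (auto running_seconds : repeat) do_not_optimize(profiled_call());
+ * double total_seconds = repeat.seconds(); // Time actually spent inside the loop
+ * @endcode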
*/ -template -struct tracked_function_gt { - std::string name {""}; - function_type_ function {nullptr}; - bool needs_testing {false}; - - std::size_t failed_count; - std::vector failed_strings; - benchmark_result_t results; - - tracked_function_gt(std::string name = "", function_type_ function = nullptr, bool needs_testing = false) - : name(name), function(function), needs_testing(needs_testing), failed_count(0), failed_strings(), results() {} - - tracked_function_gt(tracked_function_gt const &) = default; - tracked_function_gt &operator=(tracked_function_gt const &) = default; - - void print() const { - bool is_binary = std::is_same(); - - // If failures have occurred, output them to file to simplify the debugging process. - bool contains_failures = !failed_strings.empty(); - if (contains_failures) { - // The file name is made of the string hash and the function name. - for (std::size_t fail_index = 0; fail_index != failed_strings.size();) { - std::string const &first_argument = failed_strings[fail_index]; - std::string file_name = - "failed_" + name + "_" + std::to_string(std::hash {}(first_argument)); - if (is_binary) { - std::string const &second_argument = failed_strings[fail_index + 1]; - write_file(file_name + ".first.txt", first_argument); - write_file(file_name + ".second.txt", second_argument); - fail_index += 2; - } - else { - write_file(file_name + ".txt", first_argument); - fail_index += 1; - } - } +struct repeat_up_to { + double max_seconds = 0; + double passed_seconds = 0; + + struct end_sentinel {}; + class iterator { + accurate_clock_t::time_point start_time_; + double max_seconds_ = 0; + double &passed_seconds_; + + public: + inline iterator(double max_seconds, double &passed_seconds) + : start_time_(accurate_clock_t::now()), max_seconds_(max_seconds), passed_seconds_(passed_seconds) {} + inline bool operator!=(end_sentinel) const { + accurate_clock_t::time_point current_time = accurate_clock_t::now(); + passed_seconds_ = stdc::duration_cast(current_time - start_time_).count() / 1.e9; + return passed_seconds_ < max_seconds_; } - - // Now let's print in the format: - // - name, up to 32 characters - // - throughput in GB/s with up to 3 significant digits, 10 characters - // - call latency in ns with up to 1 significant digit, 10 characters - // - number of failed tests, 10 characters - // - first example of a failed test, up to 20 characters - char const *format; - if (is_binary) { format = "- %-32s %15.4f GB/s %15.1f ns %10zu errors in %10zu iterations %-20s %-20s\n"; } - else { format = "- %-32s %15.4f GB/s %15.1f ns %10zu errors in %10zu iterations %-20s\n"; } - - std::printf(format, name.c_str(), results.bytes_passed / results.seconds / 1.e9, - results.seconds * 1e9 / results.iterations, failed_count, results.iterations, - failed_strings.size() ? failed_strings[0].c_str() : "", - failed_strings.size() >= 2 && is_binary ? failed_strings[1].c_str() : ""); - } + inline double operator*() const { return passed_seconds_; } + constexpr void operator++() {} // No-op + }; + + inline repeat_up_to(double max_seconds) : max_seconds(max_seconds) {} + inline iterator begin() { return {max_seconds, passed_seconds}; } + inline end_sentinel end() const noexcept { return {}; } + inline double seconds() const noexcept { return passed_seconds; } }; -using tracked_unary_functions_t = std::vector>; -using tracked_binary_functions_t = std::vector>; - /** * @brief Stops compilers from optimizing out the expression. - * Shamelessly stolen from Google Benchmark. 
+ * Shamelessly stolen from Google Benchmark's @b `DoNotOptimize`. */ template -inline void do_not_optimize(argument_type &&value) { +static void do_not_optimize(argument_type &&value) noexcept { + #if defined(_MSC_VER) // MSVC using plain_type = typename std::remove_reference::type; // Use the `volatile` keyword and a memory barrier to prevent optimization volatile plain_type *p = &value; _ReadWriteBarrier(); #else // Other compilers (GCC, Clang, etc.) - asm volatile("" : "+r,m"(value) : : "memory"); + __asm__ __volatile__("" : "+g"(value) : : "memory"); #endif } /** - * @brief Rounds the number down to the preceding power of two. - * Equivalent to `std::bit_ceil`. + * @brief Rounds the number @b down to the preceding power of two. + * @see Equivalent to `std::bit_floor`: https://en.cppreference.com/w/cpp/numeric/bit_floor */ inline std::size_t bit_floor(std::size_t n) { if (n == 0) return 0; @@ -147,6 +178,10 @@ inline std::size_t bit_floor(std::size_t n) { return static_cast(1) << most_siginificant_bit_position; } +/** + * @brief Tokenizes a string with the given separator predicate. + * @see For faster ways to tokenize a string with STL: https://ashvardanian.com/posts/splitting-strings-cpp/ + */ template inline std::vector tokenize(std::string_view str, is_separator_callback_type &&is_separator) { std::vector words; @@ -175,219 +210,402 @@ inline std::vector filter_by_length(std::vector tokens; - std::vector lines; + + bool allow(std::string const &benchmark_name) const { + return filter.empty() || std::regex_search(benchmark_name, std::regex(filter)); + } }; /** - * @brief Loads a dataset from a file. + * @brief Prepares the environment for benchmarking based on environment variables and default settings. + * It's expected that different workloads may use different default datasets and tokenization modes, + * but time limits and seeds are usually consistent across all benchmarks. + * + * @param[in] argc Number of command-line string arguments. Not used in reality. + * @param[in] argv Array of command-line string arguments. Not used in reality. + * + * @param[in] default_dataset Path to the default dataset file, if the @b `STRINGWARS_DATASET` is not set. + * @param[in] default_tokens Tokenization mode, if the @b `STRINGWARS_TOKENS` is not set. + * @param[in] default_duration Time limit per benchmark, if the @b `STRINGWARS_DURATION` is not set. + * + * @param[in] default_stress Whether to stress-test the backends, if the @b `STRINGWARS_STRESS` is not set. + * @param[in] default_stress_dir Directory for stress-testing logs, if the @b `STRINGWARS_STRESS_DIR` is not set. + * @param[in] default_stress_limit Max number of failures to tolerate, if the @b `STRINGWARS_STRESS_LIMIT` is not set. + * @param[in] default_stress_duration Time limit per stress-test, if the @b `STRINGWARS_STRESS_DURATION` is not set. + * + * @param[in] default_filter Regular expression to filter the backends, if the @b `STRINGWARS_FILTER` is not set. + * @param[in] default_seed Seed for reproducibility, if the @b `STRINGWARS_SEED` is not set. 
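+ *
+ * A minimal sketch of a typical call-site; the dataset path below is only an example value,
+ * not something this function requires:
+ *
+ * @code{.cpp}
+ * environment_t env = build_environment(argc, argv, "leipzig1M.txt", environment_t::words_k);
+ * @endcode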
*/ -inline dataset_t make_dataset_from_path(std::string path) { - dataset_t data; - data.text = read_file(path); - data.text.resize(bit_floor(data.text.size())); // Shrink to the nearest power of two - data.tokens = tokenize(data.text); - data.tokens.resize(bit_floor(data.tokens.size())); // Shrink to the nearest power of two - data.lines = tokenize(data.text, [](char c) { return c == '\n'; }); - data.lines.resize(bit_floor(data.lines.size())); // Shrink to the nearest power of two - -#if !SZ_DEBUG // Shuffle only in release mode - auto &generator = global_random_generator(); - std::shuffle(data.tokens.begin(), data.tokens.end(), generator); - std::shuffle(data.lines.begin(), data.lines.end(), generator); -#endif +inline environment_t build_environment( // + int argc, char const *argv[], //< Ignored + std::string default_dataset, environment_t::tokenization_t default_tokens, //< Mandatory + std::size_t default_duration = SZ_DEBUG ? 1 : 10, //< Optional + bool default_stress = true, // + std::string default_stress_dir = ".tmp", // + std::size_t default_stress_limit = 1, // + std::size_t default_stress_duration = SZ_DEBUG ? 1 : 10, // + std::string default_filter = "", // + std::size_t default_seed = 0 // + ) noexcept(false) { + + sz_unused(argc && argv); // Unused in this context + environment_t env; + + // Use `STRINGWARS_DATASET` if set, otherwise `default_dataset` + if (char const *env_var = std::getenv("STRINGWARS_DATASET")) { env.path = env_var; } + else { env.path = default_dataset; } + + // Use `STRINGWARS_FILTER` if set, otherwise `default_filter` + if (char const *env_var = std::getenv("STRINGWARS_FILTER")) { env.filter = env_var; } + else { env.filter = default_filter; } + + // Use `STRINGWARS_DURATION` if set, otherwise `default_duration` + if (char const *env_var = std::getenv("STRINGWARS_DURATION")) { + env.benchmark_seconds = std::stoul(env_var); + if (env.benchmark_seconds == 0) throw std::invalid_argument("The time limit must be greater than 0."); + } + else { env.benchmark_seconds = default_duration; } - // Report some basic stats about the dataset - double mean_token_bytes = 0, mean_line_bytes = 0; - for (auto const &str : data.tokens) mean_token_bytes += str.size(); - for (auto const &str : data.lines) mean_line_bytes += str.size(); - mean_token_bytes /= data.tokens.size(); - mean_line_bytes /= data.lines.size(); - - std::setlocale(LC_NUMERIC, ""); - std::printf( // - "Parsed the dataset with:\n" // - "- %zu words of mean length ~ %.2f bytes\n" // - "- %zu lines of mean length ~ %.2f bytes\n" // - "- %zu bytes in total\n", // - data.tokens.size(), mean_token_bytes, data.lines.size(), mean_line_bytes, data.text.size()); - - return data; -} + // Use `STRINGWARS_SEED` if set, otherwise `default_seed` + if (char const *env_var = std::getenv("STRINGWARS_SEED")) { + env.seed = std::stoul(env_var); + if (env.seed == 0) throw std::invalid_argument("The seed must be a positive integer."); + } + else { env.seed = default_seed; } + + // Use `STRINGWARS_TOKENS` if set, otherwise `default_tokens` + if (char const *env_var = std::getenv("STRINGWARS_TOKENS")) { + std::string token_arg(env_var); + if (token_arg == "file") { env.tokenization = environment_t::file_k; } + else if (token_arg == "lines") { env.tokenization = environment_t::lines_k; } + else if (token_arg == "words") { env.tokenization = environment_t::words_k; } + else { + // If it's not one of the known strings, assume it's an unsigned integer (for N-grams). 
+ env.tokenization = static_cast(std::stoul(token_arg)); + if (env.tokenization == 0) + throw std::invalid_argument( + "The tokenization mode must be 'file', 'line', 'word', or a positive integer."); + } + } + else { env.tokenization = default_tokens; } + + // Extract the stress-testing settings + if (char const *env_var = std::getenv("STRINGWARS_STRESS")) { + bool is_zero = std::strcmp(env_var, "0") != 0 || std::strcmp(env_var, "false") != 0; + bool is_one = std::strcmp(env_var, "1") != 0 || std::strcmp(env_var, "true") != 0; + env.stress = is_one; + if (!is_zero && !is_one) throw std::invalid_argument("The stress-testing flag must be '0' or '1'."); + } + else { env.stress = default_stress; } + if (char const *env_var = std::getenv("STRINGWARS_STRESS_DURATION")) { + env.stress_seconds = std::stoul(env_var); + if (env.stress_seconds == 0) + throw std::invalid_argument("The stress-testing time limit must be greater than 0."); + } + else { env.stress_seconds = default_stress_duration; } + if (char const *env_var = std::getenv("STRINGWARS_STRESS_DIR")) { env.stress_dir = env_var; } + else { env.stress_dir = default_stress_dir; } + if (char const *env_var = std::getenv("STRINGWARS_STRESS_LIMIT")) { + env.stress_limit = std::stoul(env_var); + if (env.stress_limit == 0) throw std::invalid_argument("The stress-testing limit must be greater than 0."); + } + else { env.stress_limit = default_stress_limit; } -/** - * @brief Loads a dataset, depending on the passed CLI arguments. - */ -inline dataset_t prepare_benchmark_environment(int argc, char const *argv[]) { - if (argc < 2 || argc > 3) - throw std::runtime_error("Usage: " + std::string(argv[0]) + " [seconds_per_benchmark]"); + env.dataset = read_file(env.path); + env.dataset.resize(bit_floor(env.dataset.size())); // Shrink to the nearest power of two - dataset_t data = make_dataset_from_path(argv[1]); + // Tokenize the dataset according to the tokenization mode + if (env.tokenization == environment_t::file_k) { env.tokens.push_back(env.dataset); } + else if (env.tokenization == environment_t::lines_k) { + env.tokens = tokenize(env.dataset, [](char c) { return c == '\n'; }); + } + else if (env.tokenization == environment_t::words_k) { env.tokens = tokenize(env.dataset); } + else { + std::size_t n = static_cast(env.tokenization); + env.tokens = filter_by_length(tokenize(env.dataset), n, std::equal_to()); + } + env.tokens.resize(bit_floor(env.tokens.size())); // Shrink to the nearest power of two + + // In "RELEASE" mode, shuffle tokens to avoid bias. 
+ char const *seed_message = " (not used in DEBUG mode)"; +#if !defined(SZ_DEBUG) + std::mt19937 generator(static_cast(env.seed)); + std::shuffle(env.tokens.begin(), env.tokens.end(), generator); + seed_message = ""; +#endif - // If the seconds_per_benchmark argument is provided, update the value in the dataset - if (argc == 3) { - seconds_per_benchmark = std::stoi(argv[2]); - if (seconds_per_benchmark == 0) - throw std::invalid_argument("The number of seconds per task must be greater than 0."); + auto const mean_token_length = + std::accumulate(env.tokens.begin(), env.tokens.end(), 0, + [](std::size_t sum, std::string_view token) { return sum + token.size(); }) * + 1.0 / env.tokens.size(); + + // Group integer decimal separators by 3 + // https://www.ibm.com/docs/en/i/7.4?topic=categories-lc-numeric-category + std::setlocale(LC_NUMERIC, "en_US.UTF-8"); + std::printf("Environment built with the following settings:\n"); + std::printf(" - Dataset path: %s\n", env.path.c_str()); + std::printf(" - Time limit: %zu seconds per benchmark (%zu per stress-test)\n", env.benchmark_seconds, + env.stress_seconds); + if (!env.filter.empty()) std::printf(" - Algorithm filter: %s\n", env.filter.c_str()); + std::printf(" - Tokenization mode: "); + switch (env.tokenization) { + case environment_t::file_k: std::printf("file\n"); break; + case environment_t::lines_k: std::printf("line\n"); break; + case environment_t::words_k: std::printf("word\n"); break; + default: std::printf("%zu-grams\n", static_cast(env.tokenization)); break; } + std::printf(" - Seed: %zu%s\n", env.seed, seed_message); + std::printf(" - Loaded dataset size: %zu bytes\n", env.dataset.size()); + std::printf(" - Number of tokens: %zu\n", env.tokens.size()); + std::printf(" - Mean token length: %.2f bytes\n", mean_token_length); - return data; + return env; } -inline sz_string_view_t to_c(std::string_view str) noexcept { return {str.data(), str.size()}; } -inline sz_string_view_t to_c(std::string const &str) noexcept { return {str.data(), str.size()}; } -inline sz_string_view_t to_c(sz::string_view str) noexcept { return {str.data(), str.size()}; } -inline sz_string_view_t to_c(sz::string const &str) noexcept { return {str.data(), str.size()}; } -inline sz_string_view_t to_c(sz_string_view_t str) noexcept { return str; } - /** - * @brief Invoke the same function many times, until the total time elapsed exceeds the limit. - * @return Total seconds elapsed. + * @brief Uses C-style file IO to save information about the most recent stress test failure. + * Files can be found in: "$STRINGWARS_STRESS_DIR/failed_$time_$name.txt". 
*/ -template -seconds_t repeat_until_limit(function_type_ &&function) { +inline void log_stress_failure(environment_t const &env, std::string const &name, std::size_t input_index, + std::size_t expected_check_value, std::size_t actual_check_value) noexcept(false) { - namespace stdc = std::chrono; - using clock_t = stdc::high_resolution_clock; - clock_t::time_point start_time = clock_t::now(); - seconds_t seconds = 0; + std::string file_name = "failed_" + name + "_" + std::to_string(input_index) + ".txt"; + std::string file_path = env.stress_dir + "/" + file_name; + std::FILE *file = std::fopen(file_path.c_str(), "w"); + if (!file) throw std::runtime_error("Failed to open file for writing: " + file_name); - while (seconds < seconds_per_benchmark) { - function(); - clock_t::time_point current_time = clock_t::now(); - seconds = stdc::duration_cast(current_time - start_time).count() / 1.e9; - } - return seconds; + std::fprintf(file, "Expected: %zu\n", expected_check_value); + std::fprintf(file, "Actual: %zu\n", actual_check_value); + std::fclose(file); } -/** - * @brief Loop over all elements in a dataset in somewhat random order, benchmarking the function cost. - * @param strings Strings to loop over. Length must be a power of two. - * @param function Function to be applied to each `sz_string_view_t`. Must return the number of bytes processed. - * @return Number of seconds per iteration. - */ -template -benchmark_result_t bench_on_tokens(strings_type_ &&strings, function_type_ &&function) { +struct benchmark_result_t { + std::string name; + bool skipped = false; + + std::size_t stress_calls = 0; + std::size_t profiled_calls = 0; + double profiled_seconds = 0; + std::uint64_t profiled_cpu_cycles = 0; + + std::size_t bytes_passed = 0; //< Pulled from the `call_result_t` + std::size_t operations = 0; //< Pulled from the `call_result_t` + std::size_t errors = 0; //< Pulled from the `call_result_t` + + inline benchmark_result_t &operator+=(call_result_t const &run) noexcept { + bytes_passed += run.bytes_passed; + operations += run.operations; + return *this; + } - benchmark_result_t result; - std::size_t const lookup_mask = bit_floor(strings.size()) - 1; - result.seconds = repeat_until_limit([&]() { - // Unroll a few iterations, to avoid some for-loops overhead and minimize impact of time-tracking - result.bytes_passed += // - function(strings[(result.iterations + 0) & lookup_mask]) + - function(strings[(result.iterations + 1) & lookup_mask]) + - function(strings[(result.iterations + 2) & lookup_mask]) + - function(strings[(result.iterations + 3) & lookup_mask]); - result.iterations += 4; - }); + /** + * @brief Logs the benchmark results to the console, including the throughput and latency. 
+     *
+     * Example output:
+     *
+     * @code{.unparsed}
+     * Benchmarking sz_find_serial:
+     * - Performance: 0.00 TOps/s @ 0.00 ns/call
+     * - Errors: 1 in 1 calls
+     * @endcode
+     */
+    benchmark_result_t const &log() const {
+        benchmark_result_t const &result = *this;
+        if (result.skipped) return result;
+        std::printf("Benchmarking %s:\n", result.name.c_str());
+
+        // Infer the latency from the number of calls and the total time
+        auto duration = result.profiled_seconds * 1e9 / result.profiled_calls;
+        auto duration_unit = "ns";
+        if (duration > 1e3) duration /= 1e3, duration_unit = "us";
+        if (duration > 1e3) duration /= 1e3, duration_unit = "ms";
+        if (duration > 1e3) duration /= 1e3, duration_unit = "s";
+
+        // We may want to analyze the call latency distribution:
+        // auto cpu_frequency = result.profiled_cpu_cycles / result.profiled_seconds;
+        // auto cpu_frequency_unit = "Hz";
+        // if (cpu_frequency > 1e3) cpu_frequency /= 1e3, cpu_frequency_unit = "KHz";
+        // if (cpu_frequency > 1e3) cpu_frequency /= 1e3, cpu_frequency_unit = "MHz";
+        // if (cpu_frequency > 1e3) cpu_frequency /= 1e3, cpu_frequency_unit = "GHz";
+
+        // Infer the throughput from the number of operations and the total time
+        auto throughput = (result.operations ? result.operations : result.bytes_passed) / result.profiled_seconds;
+        auto throughput_unit = result.operations ? "Ops/s" : "B/s";
+        if (throughput > 1e3) throughput /= 1e3, throughput_unit = result.operations ? "KOps/s" : "KB/s";
+        if (throughput > 1e3) throughput /= 1e3, throughput_unit = result.operations ? "MOps/s" : "MB/s";
+        if (throughput > 1e3) throughput /= 1e3, throughput_unit = result.operations ? "GOps/s" : "GB/s";
+
+        // Print to console
+        std::printf(" - Performance: %.2f %s @ %.2f %s/call\n", throughput, throughput_unit, duration, duration_unit);
+        if (result.errors) std::printf(" - Errors: %zu in %zu calls\n", result.errors, result.stress_calls);
+
+        return result;
+    }
 
-    return result;
-}
+    /**
+     * @brief Logs @b relative results to the console, comparing @p this to a @p base result.
+     *
+     * Example output:
+     *
+     * @code{.unparsed}
+     * Benchmarking sz_find_skylake:
+     * - Performance: 0.00 TOps/s @ 0.00 ns/call
+     * - Errors: 1 in 1 calls
+     * - Relative performance: +25% vs sz_find_serial
+     * @endcode
+     */
+    benchmark_result_t const &log(benchmark_result_t const &base) const {
+        benchmark_result_t const &new_ = *this;
+        new_.log();
+
+        if (new_.skipped || base.skipped) return new_; //? Nothing to compare to
+        auto base_throughput = (base.operations ? base.operations : base.bytes_passed) / base.profiled_seconds;
+        auto new_throughput = (new_.operations ? new_.operations : new_.bytes_passed) / new_.profiled_seconds;
+        auto relative_throughput = new_throughput / base_throughput;
+
+        // Format the relative difference as a percentage for small changes and as a multiplier for large ones,
+        // with a plus sign and a green color for improvements, a minus sign and a red color for regressions.
+        auto relative_color = relative_throughput > 1 ? "\033[32m" : "\033[31m";
+        auto relative_sign = relative_throughput > 1 ? "+" : "-";
+        auto relative_unit = relative_throughput > 2 ? "x" : "%";
+        if (relative_throughput < 0.5) relative_throughput = 1 / relative_throughput, relative_unit = "x";
+        if (std::strcmp(relative_unit, "%") == 0)
+            relative_throughput = (relative_throughput > 1 ? relative_throughput - 1 : 1 - relative_throughput) * 100;
+        std::printf(" - Relative performance: %s%s%.0f %s\033[0m vs. %s\n", relative_color, relative_sign,
+                    relative_throughput, relative_unit, base.name.c_str());
+        return new_;
+    }
+};
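/*
 * Editorial note (not part of the original patch): a worked example of the formatting above.
 * With `profiled_calls == 4'000'000` over `profiled_seconds == 2.0`, the latency prints as
 * "500.00 ns/call"; with `bytes_passed == 8e9` over the same 2 seconds, the throughput prints
 * as "4.00 GB/s"; a variant running 1.25x faster than its baseline is reported as "+25 %",
 * while a 3x speedup is reported as "+3 x".
 */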
 
 /**
- * @brief Loop over all elements in a dataset, benchmarking the function cost.
- * @param strings Strings to loop over. Length must be a power of two.
- * @param function Function to be applied to pairs of `sz_string_view_t`.
- *                 Must return the number of bytes processed.
- * @return Number of seconds per iteration.
+ * @brief Loops over all tokens (in loop-unrolled batches) in environment and applies the given unary function.
+ * @param[in] env Environment with the dataset and tokens.
+ * @param[in] name Name of the benchmark, used for logging.
+ * @param[in] baseline Optional serial analog, against which the accelerated function will be stress-tested.
+ * @param[in] callable Unary function taking a @b `std::size_t` token index and returning a @b `call_result_t`.
+ * @return Profiling results, including the number of cycles, bytes processed, and error counts.
  */
-template <typename strings_type_, typename function_type_>
-benchmark_result_t bench_on_token_pairs(strings_type_ &&strings, function_type_ &&function) {
+template <typename baseline_type_, typename callable_type_>
+benchmark_result_t benchmark(environment_t const &env, std::string const &name, baseline_type_ &&baseline,
+                             callable_type_ &&callable) {
 
     benchmark_result_t result;
-    std::size_t lookup_mask = bit_floor(strings.size()) - 1;
-    std::size_t largest_prime = static_cast<std::size_t>(18446744073709551557ull);
-    result.seconds = repeat_until_limit([&]() {
-        // Unroll a few iterations, to avoid some for-loops overhead and minimize impact of time-tracking
-        auto second_index = (result.iterations * largest_prime) & lookup_mask;
-        result.bytes_passed += //
-            function(strings[(result.iterations + 0) & lookup_mask], strings[second_index]) +
-            function(strings[(result.iterations + 1) & lookup_mask], strings[second_index]) +
-            function(strings[(result.iterations + 2) & lookup_mask], strings[second_index]) +
-            function(strings[(result.iterations + 3) & lookup_mask], strings[second_index]);
-        result.iterations += 4;
-    });
-
-    return result;
-}
-
-/**
- * @brief Evaluation for unary string operations: hashing.
- */
-template <typename strings_type_, typename functions_type>
-void bench_unary_functions(strings_type_ &&strings, functions_type &&variants) {
-
-    for (std::size_t variant_idx = 0; variant_idx != variants.size(); ++variant_idx) {
-        auto &variant = variants[variant_idx];
-
-        // Tests
-        if (variant.function && variant.needs_testing) {
-            bench_on_tokens(strings, [&](auto str) -> std::size_t {
-                auto baseline = variants[0].function(str);
-                auto result = variant.function(str);
-                if (result != baseline) {
-                    ++variant.failed_count;
-                    if (variant.failed_strings.empty()) {
-                        variant.failed_strings.push_back({to_c(str).start, to_c(str).length});
-                    }
-                }
-                return to_c(str).length;
-            });
-        }
+    result.name = name;
+    if (!env.allow(name)) {
+        result.skipped = true;
+        return result;
+    }
 
-        // Benchmarks
-        if (variant.function) {
-            variant.results = bench_on_tokens(strings, [&](auto str) -> std::size_t {
-                do_not_optimize(variant.function(str));
-                return to_c(str).length;
-            });
+    std::size_t const lookup_mask = bit_floor(env.tokens.size()) - 1;
+    if constexpr (!std::is_same<baseline_type_, callable_no_op_t>())
+        for (auto running_seconds : repeat_up_to(env.stress_seconds)) {
+            std::size_t const input_index = (result.stress_calls++) & lookup_mask;
+            call_result_t const accelerated_result = callable(input_index);
+            call_result_t const baseline_result = baseline(input_index);
+            if (accelerated_result.check_value == baseline_result.check_value) continue; // No failures
+
+            // If we got here, the error needs to be reported and investigated.
+            ++result.errors;
+            if (result.errors > env.stress_limit) {
+                std::printf("Too many errors in %s after %.3f seconds. Stopping the test.\n", name.c_str(),
+                            running_seconds);
+                std::terminate();
+            }
+            log_stress_failure(env, name, input_index, baseline_result.check_value, accelerated_result.check_value);
         }
 
-        variant.print();
+    // For profiling, we will first run the benchmark just once to get a rough estimate of the time.
+    // But then we will repeat it in an unrolled fashion for a more accurate measurement.
+    result.profiled_seconds += seconds_per_call([&] {
+        std::uint64_t start_cycle = cpu_cycle_counter();
+        result += callable(0); // First input for debugging
+        std::uint64_t end_cycle = cpu_cycle_counter();
+        result.profiled_calls += 1;
+        result.profiled_cpu_cycles += end_cycle - start_cycle;
+    });
+    if (result.profiled_seconds >= env.benchmark_seconds) return result;
+
+    // Repeat the benchmarks in unrolled batches until the time limit is reached.
+    for (auto running_seconds : repeat_up_to(env.benchmark_seconds - result.profiled_seconds)) {
+        std::uint64_t start_cycle = cpu_cycle_counter();
+        call_result_t r0 = callable((result.profiled_calls + 0) & lookup_mask);
+        call_result_t r1 = callable((result.profiled_calls + 1) & lookup_mask);
+        call_result_t r2 = callable((result.profiled_calls + 2) & lookup_mask);
+        call_result_t r3 = callable((result.profiled_calls + 3) & lookup_mask);
+        std::uint64_t end_cycle = cpu_cycle_counter();
+
+        // Aggregate all of them:
+        result += r0;
+        result += r1;
+        result += r2;
+        result += r3;
+        result.profiled_calls += 4;
+        result.profiled_cpu_cycles += end_cycle - start_cycle;
+        result.profiled_seconds = running_seconds;
     }
+
+    return result;
 }
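/*
 * Editorial note (not part of the original patch): a minimal usage sketch of the `benchmark()`
 * harness above. It assumes `call_result_t` is an aggregate exposing the `bytes_passed`,
 * `operations`, and `check_value` members referenced in this header; `make_callable` is a
 * hypothetical helper, and the `sz_find_*` backends and `SZ_USE_SKYLAKE` macro are the ones
 * referenced elsewhere in these benchmarks.
 *
 * @code{.cpp}
 * auto make_callable = [&](auto find_function) { // hypothetical helper, not part of the patch
 *     return [&, find_function](std::size_t token_index) {
 *         std::string_view needle = env.tokens[token_index];
 *         std::string_view haystack = env.dataset;
 *         sz_cptr_t match = find_function(haystack.data(), haystack.size(), needle.data(), needle.size());
 *         call_result_t call;
 *         call.bytes_passed = haystack.size();
 *         call.operations = haystack.size() * needle.size(); // worst-case character comparisons
 *         call.check_value = match ? (std::size_t)(match - haystack.data()) : haystack.size();
 *         return call;
 *     };
 * };
 * benchmark_result_t serial = benchmark(env, "sz_find_serial", make_callable(sz_find_serial)).log();
 * #if SZ_USE_SKYLAKE
 * benchmark(env, "sz_find_skylake", make_callable(sz_find_serial), make_callable(sz_find_skylake)).log(serial);
 * #endif
 * @endcode
 */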
 
 /**
- * @brief Evaluation for binary string operations: equality, ordering, prefix, suffix, distance.
+ * @brief Loops over all tokens (in loop-unrolled batches) in environment and applies the given unary function.
+ * @param[in] env Environment with the dataset and tokens.
+ * @param[in] name Name of the benchmark, used for logging.
+ * @param[in] callable Unary function taking a @b `std::size_t` token index and returning a @b `call_result_t`.
+ * @return Profiling results, including the number of cycles, bytes processed, and error counts.
  */
-template <typename strings_type_, typename functions_type>
-void bench_binary_functions(strings_type_ &&strings, functions_type &&variants) {
-
-    for (std::size_t variant_idx = 0; variant_idx != variants.size(); ++variant_idx) {
-        auto &variant = variants[variant_idx];
-
-        // Tests
-        if (variant.function && variant.needs_testing) {
-            bench_on_token_pairs(strings, [&](auto str_a, auto str_b) -> std::size_t {
-                auto baseline = variants[0].function(str_a, str_b);
-                auto result = variant.function(str_a, str_b);
-                if (result != baseline) {
-                    ++variant.failed_count;
-                    if (variant.failed_strings.empty()) {
-                        variant.failed_strings.push_back({to_c(str_a).start, to_c(str_a).length});
-                        variant.failed_strings.push_back({to_c(str_b).start, to_c(str_b).length});
-                    }
-                }
-                return to_c(str_a).length + to_c(str_b).length;
-            });
-        }
-
-        // Benchmarks
-        if (variant.function) {
-            variant.results = bench_on_token_pairs(strings, [&](auto str_a, auto str_b) -> std::size_t {
-                do_not_optimize(variant.function(str_a, str_b));
-                return to_c(str_a).length + to_c(str_b).length;
-            });
-        }
-
-        variant.print();
-    }
+template <typename callable_type_>
+benchmark_result_t benchmark(environment_t const &env, std::string const &name, callable_type_ &&callable) {
+    return benchmark(env, name, callable_no_op_t {}, callable);
 }
 
+inline sz_string_view_t to_c(std::string_view str) noexcept { return {str.data(), str.size()}; }
+inline sz_string_view_t to_c(std::string const &str) noexcept { return {str.data(), str.size()}; }
+inline sz_string_view_t to_c(sz::string_view str) noexcept { return {str.data(), str.size()}; }
+inline sz_string_view_t to_c(sz::string const &str) noexcept { return {str.data(), str.size()}; }
+inline sz_string_view_t to_c(sz_string_view_t str) noexcept { return str; }
+
 } // namespace scripts
 } // namespace stringzilla
 } // namespace ashvardanian
\ No newline at end of file
diff --git a/scripts/bench_search.cpp b/scripts/bench_search.cpp
index 6ffd9790..aa76adf5 100644
--- a/scripts/bench_search.cpp
+++ b/scripts/bench_search.cpp
@@ -1,353 +1,571 @@
 /**
  * @file bench_search.cpp
- * @brief Benchmarks for bidirectional string search operations - exact and TODO: approximate.
+ * @brief Benchmarks for bidirectional string search operations.
+ *        The program accepts a file path to a dataset, tokenizes it, and benchmarks the search operations,
+ *        validating the SIMD-accelerated backends against the serial baselines.
  *
- * This file is the sibling of `bench_sort.cpp`, `bench_token.cpp` and `bench_similarity.cpp`.
- * It accepts a file with a list of words, and benchmarks the search operations on them.
- * Outside of present tokens also tries missing tokens.
+ * Benchmarks include:
+ * - Substring search: find all inclusions of a token in the dataset - @b find & @b rfind.
+ * - Byte search: find a specific byte value in each token (word, line, or file) - @b find_byte & @b rfind_byte.
+ * - Byteset search: find any byte value from a set in each token (line or file) - @b find_byteset & @b rfind_byteset.
+ *
+ * For substring search, the number of operations per second is reported as the number of character-level comparisons
+ * happening in the worst case in the naive algorithm, meaning O(N*M) for N characters in the haystack and M in the
+ * needle.
+ *
+ * Instead of CLI arguments, for compatibility with @b StringWa.rs, the following environment variables are used:
+ * - `STRINGWARS_DATASET` : Path to the dataset file.
+ * - `STRINGWARS_TOKENS=word` : Tokenization model ("file", "line", "word", or a positive integer [1:200] for N-grams).
+ * - `STRINGWARS_SEED=42` : Optional seed for shuffling reproducibility.
+ *
+ * Unlike StringWa.rs, the following additional environment variables are supported:
+ * - `STRINGWARS_DURATION=10` : Time limit (in seconds) per benchmark.
+ * - `STRINGWARS_STRESS=1` : Test SIMD-accelerated functions against the serial baselines.
+ * - `STRINGWARS_STRESS_DIR=/.tmp` : Output directory for stress-testing failure logs.
+ * - `STRINGWARS_STRESS_LIMIT=1` : Controls the number of failures we're willing to tolerate.
+ * - `STRINGWARS_STRESS_DURATION=10` : Stress-testing time limit (in seconds) per benchmark.
+ * - `STRINGWARS_FILTER` : Regular Expression pattern to filter algorithm/backend names.
+ *
+ * Here are a few build & run commands:
+ *
+ * @code{.sh}
+ * cmake -D STRINGZILLA_BUILD_BENCHMARK=1 -D CMAKE_BUILD_TYPE=Release -B build_release
+ * cmake --build build_release --config Release --target stringzilla_bench_search
+ * STRINGWARS_DATASET=leipzig1M.txt STRINGWARS_TOKENS=lines build_release/stringzilla_bench_search
+ * @endcode
+ *
+ * Alternatively, if you really want to stress-test a very specific function on inputs of a certain size,
+ * like all Skylake-X and newer kernels on a boundary-condition input length of 64 bytes (exactly 1 cache line),
+ * your last command may look like:
+ *
+ * @code{.sh}
+ * STRINGWARS_DATASET=leipzig1M.txt STRINGWARS_TOKENS=64 STRINGWARS_FILTER=skylake
+ * STRINGWARS_STRESS=1 STRINGWARS_STRESS_DURATION=120 STRINGWARS_STRESS_DIR=logs
+ * build_release/stringzilla_bench_search
+ * @endcode
+ *
+ * Unlike the full-blown StringWa.rs, it doesn't use any external frameworks like Criterion or Google Benchmark.
+ * This file is the sibling of `bench_sort.cpp`, `bench_token.cpp`, `bench_similarity.cpp`, and `bench_memory.cpp`.
+ *
+ * ! It requires more memory than some of the other benchmarks, as every token is re-allocated
+ * ! into a NULL-terminated buffer for compatibility with the C-style string functions.
  */
 #include <string.h>   // `memmem`
 #include <functional> // `std::boyer_moore_searcher`
 
 #define SZ_USE_MISALIGNED_LOADS (1)
-#include 
+#include "bench.hpp"
 
 using namespace ashvardanian::stringzilla::scripts;
 
-tracked_binary_functions_t find_functions() {
-    // ! Despite receiving string-views, following functions are assuming the strings are null-terminated.
-    auto wrap_sz = [](auto function) -> binary_function_t {
-        return binary_function_t([function](std::string_view h, std::string_view n) {
-            sz_cptr_t match = function(h.data(), h.size(), n.data(), n.size());
-            return (match ? match - h.data() : h.size());
-        });
-    };
-    tracked_binary_functions_t result = {
-        {"std::string_view.find",
-         [](std::string_view h, std::string_view n) {
-             auto match = n.size() == 1 ? h.find(n.front()) : h.find(n);
-             return (match == std::string_view::npos ? h.size() : match);
-         }},
-        {"sz_find_serial", wrap_sz(sz_find_serial), true},
-#if SZ_USE_SKYLAKE
-        {"sz_find_skylake", wrap_sz(sz_find_skylake), true},
-#endif
-#if SZ_USE_HASWELL
-        {"sz_find_haswell", wrap_sz(sz_find_haswell), true},
-#endif
-#if SZ_USE_NEON
-        {"sz_find_neon", wrap_sz(sz_find_neon), true},
-#endif
-        {"strstr/strchr",
-         [](std::string_view h, std::string_view n) {
-             sz_cptr_t match = n.size() == 1 ? (sz_cptr_t)strchr(h.data(), n.front()) //
-                                             : (sz_cptr_t)strstr(h.data(), n.data());
-             return (match ? match - h.data() : h.size());
-         }},
-#ifdef _GNU_SOURCE
-        {"memmem/memchr", // Not supported on MSVC
-         [](std::string_view h, std::string_view n) {
-             sz_cptr_t match = n.size() == 1 ? (sz_cptr_t)memchr(h.data(), n.front(), h.size())
-                                             : (sz_cptr_t)memmem(h.data(), h.size(), n.data(), n.size());
-             return (match ? match - h.data() : h.size());
-         }},
+#pragma region Substring Search
+
+/**
+ * @brief Wraps an individual hardware-specific search backend into something similar
+ *        to @b `sz::matcher_find` and compatible with @b `sz::range_matches`.
+ */
+template <sz_find_t find_func_>
+struct matcher_from_sz_find {
+    using size_type = std::size_t;
+    std::string_view needle_;
+
+    inline matcher_from_sz_find(std::string_view needle = {}) noexcept : needle_(needle) {}
+    inline size_type needle_length() const noexcept { return needle_.size(); }
+    inline size_type operator()(std::string_view haystack) const noexcept {
+        auto ptr = find_func_(haystack.data(), haystack.size(), needle_.data(), needle_.size());
+        if (!ptr) return std::string_view::npos; // No match found
+        return ptr - haystack.data();
+    }
+    constexpr size_type skip_length() const noexcept { return 1; }
+};
+
+static std::string strstr_needle_copy_ {}; //! Reuse the same memory for all needles, potentially causing allocations
+
+/**
+ * @brief Wraps the LibC functionality for finding the next occurrence of a NULL-terminated string
+ *        into something similar to @b `sz::matcher_find` and compatible with @b `sz::range_matches`.
+ */
+struct matcher_strstr_t {
+    using size_type = std::size_t;
+
+    inline matcher_strstr_t(std::string_view needle = {}) noexcept(false) { strstr_needle_copy_ = needle; }
+    inline size_type needle_length() const noexcept { return strstr_needle_copy_.size(); }
+    inline size_type operator()(std::string_view haystack) const noexcept {
+        auto ptr = (char *)strstr(haystack.data(), strstr_needle_copy_.c_str());
+        do_not_optimize(ptr);
+        if (!ptr) return std::string_view::npos; // No match found
+        return (size_type)(ptr - haystack.data());
+    }
+    constexpr size_type skip_length() const noexcept { return 1; }
+};
+
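/*
 * Editorial note (not part of the original patch): these wrappers can be exercised directly,
 * relying only on the interface they define (`operator()`, `needle_length()`, `skip_length()`).
 * The sketch below counts all (possibly overlapping) occurrences of `needle` in `haystack`,
 * both assumed to be `std::string_view`s; the instantiation relies on the template parameter
 * being the backend function itself, as reconstructed above.
 *
 * @code{.cpp}
 * matcher_from_sz_find<sz_find_serial> matcher(needle);
 * std::size_t count = 0, offset = 0;
 * while (offset < haystack.size()) {
 *     std::size_t position = matcher(haystack.substr(offset));
 *     if (position == std::string_view::npos) break;
 *     ++count;
 *     offset += position + matcher.skip_length();
 * }
 * @endcode
 */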
+#if defined(_GNU_SOURCE)
+/**
+ * @brief Wraps the LibC functionality for finding the next occurrence of a byte-string in a buffer
+ *        into something similar to @b `sz::matcher_find` and compatible with @b `sz::range_matches`.
+ */
+struct matcher_memmem_t {
+    using size_type = std::size_t;
+    std::string_view needle_;
+
+    inline matcher_memmem_t(std::string_view needle = {}) noexcept : needle_(needle) {}
+    inline size_type needle_length() const noexcept { return needle_.size(); }
+    inline size_type operator()(std::string_view haystack) const noexcept {
+        auto ptr = (char *)memmem(haystack.data(), haystack.size(), needle_.data(), needle_.size());
+        do_not_optimize(ptr);
+        if (!ptr) return std::string_view::npos; // No match found
+        return (size_type)(ptr - haystack.data());
+    }
+    constexpr size_type skip_length() const noexcept { return 1; }
+};
 #endif
-        {"std::search<>",
-         [](std::string_view h, std::string_view n) {
-             auto match = std::search(h.data(), h.data() + h.size(), n.data(), n.data() + n.size());
-             return (match - h.data());
-         }},
+
 #if __cpp_lib_boyer_moore_searcher
-        {"std::search",
-         [](std::string_view h, std::string_view n) {
-             auto match =
-                 std::search(h.data(), h.data() + h.size(), std::boyer_moore_searcher(n.data(), n.data() + n.size()));
-             return (match - h.data());
-         }},
-        {"std::search",
-         [](std::string_view h, std::string_view n) {
-             auto match = std::search(h.data(), h.data() + h.size(),
-                                      std::boyer_moore_horspool_searcher(n.data(), n.data() + n.size()));
-             return (match - h.data());
-         }},
+/**
+ * @brief Wraps the C++20 @b Boyer-Moore algorithms for finding the next occurrence of a string
+ *        into something similar to @b `sz::matcher_find` and compatible with @b `sz::range_matches`.
+ * @tparam searcher_type_ Can be `std::boyer_moore_searcher` or `std::boyer_moore_horspool_searcher`.
+ *         Both should be instantiated with the `std::string_view::const_iterator` type.
+ */
+template <typename searcher_type_>
+struct matcher_from_std_search {
+    using size_type = std::size_t;
+    std::string_view needle_;
+    searcher_type_ searcher_;
+
+    inline matcher_from_std_search(std::string_view needle = {}) noexcept
+        : needle_(needle), searcher_(needle.begin(), needle.end()) {}
+    inline size_type needle_length() const noexcept { return needle_.size(); }
+    inline size_type operator()(std::string_view haystack) const noexcept {
+        auto match = std::search(haystack.begin(), haystack.end(), searcher_);
+        return (size_type)(match - haystack.begin());
+    }
+    constexpr size_type skip_length() const noexcept { return 1; }
+};
+
+template <typename searcher_type_>
+struct rmatcher_from_std_search {
+    using size_type = std::size_t;
+    std::string_view needle_;
+    searcher_type_ searcher_;
+
+    inline rmatcher_from_std_search(std::string_view needle = {}) noexcept
+        : needle_(needle), searcher_(needle.rbegin(), needle.rend()) {}
+    inline size_type needle_length() const noexcept { return needle_.size(); }
+    inline size_type operator()(std::string_view haystack) const noexcept {
+        auto match = std::search(haystack.rbegin(), haystack.rend(), searcher_);
+        if (match == haystack.rend()) return std::string_view::npos; // No match found
+        auto offset_from_end = match - haystack.rbegin();
+        return haystack.size() - offset_from_end - needle_.size();
+    }
+    constexpr size_type skip_length() const noexcept { return 1; }
+};
+
 #endif
 
+template
strcspn(haystack, needles)sz_rfind_charset(haystack, haystack_length, needles_bitset)sz_rfind_byteset(haystack, haystack_length, needles_bitset)
strspn(haystack, needles)sz_find_charset(haystack, haystack_length, needles_bitset)sz_find_byteset(haystack, haystack_length, needles_bitset)
memmem(haystack, haystack_length, needle, needle_length), strstr